From 3934788a7b4df71f8bd7a2a1f1c0480f06a2076e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 10 Sep 2017 19:02:25 -0700 Subject: net: tcp_input: Neaten DBGUNDO Move the #ifdef into the static void function so that the use of DBGUNDO is validated when FASTRETRANS_DEBUG <= 1. Remove the now unnecessary #else and #define DBGUNDO. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5d7656beeee..bddf724f5c02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2333,9 +2333,9 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } -#if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { +#if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -2357,10 +2357,8 @@ static void DBGUNDO(struct sock *sk, const char *msg) tp->packets_out); } #endif -} -#else -#define DBGUNDO(x...) do { } while (0) #endif +} static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { -- cgit v1.2.3 From 7016e0627171878810798a842a416dddee4e3329 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 13 Sep 2017 13:58:15 -0700 Subject: net: Convert int functions to bool Global function ipv6_rcv_saddr_equal and static functions ipv6_rcv_saddr_equal and ipv4_rcv_saddr_equal currently return int. bool is slightly more descriptive for these functions so change their return type from int to bool. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..9707372b78ed 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, - const struct in6_addr *sk2_rcv_saddr6, - __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) +static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; + return true; if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; + return true; if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; + return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) - return 1; + return true; - return 0; + return false; } #endif @@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) +static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) -- cgit v1.2.3 From 7e5dd53f6ee81ffa0d1f42f79e6440b6a751ab40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 12:40:38 +0100 Subject: net_sched: use explicit size of struct tcmsg, remove need to declare tcm Pointer tcm is being initialized and is never read, it is only being used to determine the size of struct tcmsg. Clean this up by removing variable tcm and explicitly using the sizeof struct tcmsg rather than *tcm. Cleans up clang warning: warning: Value stored to 'tcm' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/sched/sch_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c6deb74e3d2f..aa82116ed10c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1500,7 +1500,6 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) int s_idx, s_q_idx; struct net_device *dev; const struct nlmsghdr *nlh = cb->nlh; - struct tcmsg *tcm = nlmsg_data(nlh); struct nlattr *tca[TCA_MAX + 1]; int err; @@ -1510,7 +1509,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); if (err < 0) return err; -- cgit v1.2.3 From 3c75f6ee139d464351f8ab77a042dd3635769d98 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Sep 2017 12:36:22 -0700 Subject: net_sched: sch_htb: add per class overlimits counter HTB qdisc overlimits counter is properly increased, but we have no per class counter, meaning it is difficult to diagnose HTB problems. This patch adds this counter, visible in "tc -s class show dev eth0", with current iproute2. Signed-off-by: Eric Dumazet Reported-by: Denys Fedoryshchenko Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7e148376ba52..c6d7ae81b41f 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -142,6 +142,7 @@ struct htb_class { struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ unsigned int drops ____cacheline_aligned_in_smp; + unsigned int overlimits; }; struct htb_level { @@ -533,6 +534,9 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) if (new_mode == cl->cmode) return; + if (new_mode == HTB_CANT_SEND) + cl->overlimits++; + if (cl->prio_activity) { /* not necessary: speed optimization */ if (cl->cmode != HTB_CANT_SEND) htb_deactivate_prios(q, cl); @@ -1143,6 +1147,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) struct htb_class *cl = (struct htb_class *)arg; struct gnet_stats_queue qs = { .drops = cl->drops, + .overlimits = cl->overlimits, }; __u32 qlen = 0; -- cgit v1.2.3 From 173f4c5ebbd803023e42799d956cf174dea92db5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 18 Sep 2017 20:18:55 +0200 Subject: vsock: vmci: Remove unneeded linux/miscdevice.h include net/vmw_vsock/vmci_transport.c does not use any miscdevice so this patch remove this unnecessary inclusion. Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 10ae7823a19d..0206155bff53 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan using an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 27 +++++---------------------- net/sched/sch_netem.c | 7 ++++--- 2 files changed, 9 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bddf724f5c02..db9bb46b5776 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4266,11 +4266,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4286,7 +4281,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4311,10 +4305,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4351,9 +4342,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4365,8 +4353,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4420,10 +4407,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4451,7 +4434,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4502,7 +4485,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4554,7 +4537,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..063a4bdb9ee6 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -146,7 +146,6 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; - ktime_t tstamp_save; }; @@ -561,7 +560,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } cb->time_to_send = now + delay; - cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -629,7 +627,10 @@ deliver: qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; - skb->tstamp = netem_skb_cb(skb)->tstamp_save; + /* skb->dev shares skb->rbnode area, + * we need to restore its value. + */ + skb->dev = qdisc_dev(sch); #ifdef CONFIG_NET_CLS_ACT /* -- cgit v1.2.3 From f5619866592c65adc087364cc1a3ba709201ea26 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:57 -0400 Subject: net: dsa: remove copy of master ethtool_ops There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 03c58b0eb082..abadf7b49236 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -124,11 +124,10 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) if (!cpu_ops) return -ENOMEM; - memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); cpu_dp->orig_ethtool_ops = master->ethtool_ops; - memcpy(cpu_ops, &cpu_dp->ethtool_ops, - sizeof(struct ethtool_ops)); + if (cpu_dp->orig_ethtool_ops) + memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); master->ethtool_ops = cpu_ops; @@ -138,6 +137,7 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) { cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } void dsa_cpu_dsa_destroy(struct dsa_port *port) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..2ff4f907d137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -574,12 +574,13 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) { - count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data); + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); } if (ds->ops->get_ethtool_stats) @@ -591,10 +592,11 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) - count += cpu_dp->ethtool_ops.get_sset_count(dev, sset); + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) count += ds->ops->get_sset_count(ds); @@ -608,6 +610,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; @@ -619,9 +622,9 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; - if (cpu_dp->ethtool_ops.get_sset_count) { - mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_strings(dev, stringset, data); + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); } if (stringset == ETH_SS_STATS && ds->ops->get_strings) { -- cgit v1.2.3 From cd8d7dd41bfddaa02e34db35abfab14ce4584dee Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:58 -0400 Subject: net: dsa: setup master ethtool unconditionally When a DSA switch tree is meant to be applied, it already has a CPU port. Thus remove the condition of dst->cpu_dp. Moreover, the next lines access dst->cpu_dp unconditionally. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 873af0108e24..bd19304f862f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,11 +433,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_dp) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - } + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -474,10 +472,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_dp) { - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dst->cpu_dp = NULL; - } + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; -- cgit v1.2.3 From 1943563dfd4b3a1f9dc102f056813112d29bb60f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:59 -0400 Subject: net: dsa: setup master ethtool after dsa_ptr DSA overrides the master's ethtool ops so that we can inject its CPU port's statistics. Because of that, we need to setup the ethtool ops after the master's dsa_ptr pointer has been assigned, not before. This patch setups the ethtool ops after dsa_ptr is assigned, and restores them before it gets cleared. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 12 +++++++----- net/dsa/legacy.c | 10 +++------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index bd19304f862f..032f8bc3e788 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,16 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. */ wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; + + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; + dst->applied = true; return 0; @@ -456,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -472,7 +475,6 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 91e6f7981d39..163910699db7 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -206,10 +206,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, netdev_err(master, "[%d] : can't configure CPU and DSA ports\n", index); - ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp); - if (ret) - return ret; - return 0; } @@ -606,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst; - return 0; + return dsa_cpu_port_ethtool_setup(dst->cpu_dp); } static int dsa_probe(struct platform_device *pdev) @@ -671,6 +667,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -686,8 +684,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dev_put(dst->cpu_dp->netdev); } -- cgit v1.2.3 From f2f2356685bcaf1063859356fc65a5ac808b1382 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:57:00 -0400 Subject: net: dsa: move master ethtool code DSA overrides the master device ethtool ops, so that it can inject stats from its dedicated switch CPU port as well. The related code is currently split in dsa.c and slave.c, but it only scopes the master net device. Move it to a new master.c DSA core file. This file will be later extented with master net device specific code. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 28 ------------- net/dsa/dsa2.c | 4 +- net/dsa/dsa_priv.h | 7 ++-- net/dsa/legacy.c | 4 +- net/dsa/master.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 83 ------------------------------------ 7 files changed, 129 insertions(+), 119 deletions(-) create mode 100644 net/dsa/master.c (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index fcce25da937c..2e7ac8bab19d 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index abadf7b49236..81c852e32821 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) -{ - struct dsa_switch *ds = cpu_dp->ds; - struct net_device *master; - struct ethtool_ops *cpu_ops; - - master = cpu_dp->netdev; - - cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); - if (!cpu_ops) - return -ENOMEM; - - cpu_dp->orig_ethtool_ops = master->ethtool_ops; - if (cpu_dp->orig_ethtool_ops) - memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); - - dsa_cpu_port_ethtool_init(cpu_ops); - master->ethtool_ops = cpu_ops; - - return 0; -} - -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) -{ - cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; - cpu_dp->orig_ethtool_ops = NULL; -} - void dsa_cpu_dsa_destroy(struct dsa_port *port) { struct device_node *port_dn = port->dn; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 032f8bc3e788..dcccaebde708 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -440,7 +440,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) return err; @@ -457,7 +457,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dsa_master_ethtool_restore(dst->cpu_dp->netdev); dst->cpu_dp->netdev->dsa_ptr = NULL; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9c3eeb72462d..f616b3444418 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -97,8 +97,6 @@ struct dsa_slave_priv { int dsa_cpu_dsa_setup(struct dsa_port *port); void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp); -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ @@ -112,6 +110,10 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); +/* master.c */ +int dsa_master_ethtool_setup(struct net_device *dev); +void dsa_master_ethtool_restore(struct net_device *dev); + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -139,7 +141,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_port *port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 163910699db7..ae505d8e4417 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -602,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst; - return dsa_cpu_port_ethtool_setup(dst->cpu_dp); + return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } static int dsa_probe(struct platform_device *pdev) @@ -667,7 +667,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dsa_master_ethtool_restore(dst->cpu_dp->netdev); dst->cpu_dp->netdev->dsa_ptr = NULL; diff --git a/net/dsa/master.c b/net/dsa/master.c new file mode 100644 index 000000000000..5e5147ec5a44 --- /dev/null +++ b/net/dsa/master.c @@ -0,0 +1,120 @@ +/* + * Handling of a master device, switching frames via its switch fabric CPU port + * + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include "dsa_priv.h" + +static void dsa_master_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int count = 0; + + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); + } + + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, port->index, data + count); +} + +static int dsa_master_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int count = 0; + + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); + + if (sset == ETH_SS_STATS && ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); + + return count; +} + +static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, + uint8_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int len = ETH_GSTRING_LEN; + int mcount = 0, count; + unsigned int i; + uint8_t pfx[4]; + uint8_t *ndata; + + snprintf(pfx, sizeof(pfx), "p%.2d", port->index); + /* We do not want to be NULL-terminated, since this is a prefix */ + pfx[sizeof(pfx) - 1] = '_'; + + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); + } + + if (stringset == ETH_SS_STATS && ds->ops->get_strings) { + ndata = data + mcount * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->ops->get_strings(ds, port->index, ndata); + count = ds->ops->get_sset_count(ds); + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + } +} + +int dsa_master_ethtool_setup(struct net_device *dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + struct ethtool_ops *ops; + + ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + port->orig_ethtool_ops = dev->ethtool_ops; + if (port->orig_ethtool_ops) + memcpy(ops, port->orig_ethtool_ops, sizeof(*ops)); + + ops->get_sset_count = dsa_master_get_sset_count; + ops->get_ethtool_stats = dsa_master_get_ethtool_stats; + ops->get_strings = dsa_master_get_strings; + + dev->ethtool_ops = ops; + + return 0; +} + +void dsa_master_ethtool_restore(struct net_device *dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + + dev->ethtool_ops = port->orig_ethtool_ops; + port->orig_ethtool_ops = NULL; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2ff4f907d137..d51b10450e1b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -567,82 +567,6 @@ static void dsa_slave_get_strings(struct net_device *dev, } } -static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, - uint64_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - s8 cpu_port = cpu_dp->index; - int count = 0; - - if (ops && ops->get_sset_count && ops->get_ethtool_stats) { - count = ops->get_sset_count(dev, ETH_SS_STATS); - ops->get_ethtool_stats(dev, stats, data); - } - - if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, cpu_port, data + count); -} - -static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - int count = 0; - - if (ops && ops->get_sset_count) - count += ops->get_sset_count(dev, sset); - - if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); - - return count; -} - -static void dsa_cpu_port_get_strings(struct net_device *dev, - uint32_t stringset, uint8_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - s8 cpu_port = cpu_dp->index; - int len = ETH_GSTRING_LEN; - int mcount = 0, count; - unsigned int i; - uint8_t pfx[4]; - uint8_t *ndata; - - snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port); - /* We do not want to be NULL-terminated, since this is a prefix */ - pfx[sizeof(pfx) - 1] = '_'; - - if (ops && ops->get_sset_count && ops->get_strings) { - mcount = ops->get_sset_count(dev, ETH_SS_STATS); - ops->get_strings(dev, stringset, data); - } - - if (stringset == ETH_SS_STATS && ds->ops->get_strings) { - ndata = data + mcount * len; - /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle - * the output after to prepend our CPU port prefix we - * constructed earlier - */ - ds->ops->get_strings(ds, cpu_port, ndata); - count = ds->ops->get_sset_count(ds); - for (i = 0; i < count; i++) { - memmove(ndata + (i * len + sizeof(pfx)), - ndata + i * len, len - sizeof(pfx)); - memcpy(ndata + i * len, pfx, sizeof(pfx)); - } - } -} - static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) @@ -979,13 +903,6 @@ static void dsa_slave_get_stats64(struct net_device *dev, } } -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) -{ - ops->get_sset_count = dsa_cpu_port_get_sset_count; - ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats; - ops->get_strings = dsa_cpu_port_get_strings; -} - static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { -- cgit v1.2.3 From 752fbcc33405d6f8249465e4b2c4e420091bb825 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 19 Sep 2017 13:15:42 -0700 Subject: net_sched: no need to free qdisc in RCU callback gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller no longer needs to wait for a grace period. So this patch gets rid of it. Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..1fb0c754b7fd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -688,10 +688,8 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_rcu_free(struct rcu_head *head) +static void qdisc_free(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); @@ -724,11 +722,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(qdisc->gso_skb); kfree_skb(qdisc->skb_bad_txq); - /* - * gen_estimator est_timer() might access qdisc->q.lock, - * wait a RCU grace period before freeing qdisc. - */ - call_rcu(&qdisc->rcu_head, qdisc_rcu_free); + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); -- cgit v1.2.3 From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: ipv6: addrlabel: per netns list Having a global list of labels do not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. This is time having a per netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 81 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -30,7 +30,6 @@ * Policy Table */ struct ip6addrlbl_entry { - possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { struct rcu_head rcu; }; -static struct ip6addrlbl_table -{ - struct hlist_head head; - spinlock_t lock; - u32 seq; -} ip6addrlbl_table; - -static inline -struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) -{ - return read_pnet(&lbl->lbl_net); -} - /* * Default policy table (RFC6724 + extensions) * @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static bool __ip6addrlbl_match(struct net *net, - const struct ip6addrlbl_entry *p, +static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { - if (!net_eq(ip6addrlbl_net(p), net)) - return false; if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, int type, int ifindex) { struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, - const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - write_pnet(&newp->lbl_net, net); refcount_set(&newp->refcnt, 1); return newp; } /* add a label */ -static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, + int replace) { - struct hlist_node *n; struct ip6addrlbl_entry *last = NULL, *p = NULL; + struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (last) hlist_add_behind_rcu(&newp->list, &last->list); else - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - ip6addrlbl_table.seq++; + net->ipv6.ip6addrlbl_table.seq++; return ret; } @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); - newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); - spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_add(newp, replace); - spin_unlock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(net, newp, replace); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) ip6addrlbl_free(newp); return ret; @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && - net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); - spin_lock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) ADDRLABEL(KERN_DEBUG "%s\n", __func__); + spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) struct hlist_node *n; /* Remove all labels belonging to the exiting net */ - spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { - if (net_eq(ip6addrlbl_net(p), net)) { - hlist_del_rcu(&p->list); - ip6addrlbl_put(p); - } + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); } - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { int __init ipv6_addr_label_init(void) { - spin_lock_init(&ip6addrlbl_table.lock); - return register_pernet_subsys(&ipv6_addr_label_ops); } @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (idx >= s_idx && - net_eq(ip6addrlbl_net(p), net)) { + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); if (p && !ip6addrlbl_hold(p)) p = NULL; - lseq = ip6addrlbl_table.seq; + lseq = net->ipv6.ip6addrlbl_table.seq; rcu_read_unlock(); if (!p) { -- cgit v1.2.3 From 789e6ddb0b2fb5d5024b760b178a47876e4de7a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:07 -0700 Subject: tcp: batch tcp_net_metrics_exit When dealing with a list of dismantling netns, we can scan tcp_metrics once, saving cpu cycles. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 102b2c90bb80..0ab78abc811b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -892,10 +892,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1018,14 +1022,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) -- cgit v1.2.3 From bb401caefe9d2c65e0c0fa23b21deecfbfa473fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:08 -0700 Subject: ipv6: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 110 267 5504 1 2 : tunables 8 4 0 : slabdata 110 267 0 real 3m25.292s user 0m0.644s sys 0m40.153s After patch: $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 20 +++++++++++--------- net/ipv6/ip6_vti.c | 23 ++++++++++++++--------- net/ipv6/sit.c | 9 ++++++--- 4 files changed, 36 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..c82d41ef25e2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1155,19 +1155,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_net(struct net *net) +static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) { + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip6gre_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit = ip6gre_exit_net, + .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..3d6df489b39f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2167,17 +2167,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; - LIST_HEAD(list); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); @@ -2186,12 +2185,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } - - unregister_netdevice_many(&list); } static int __net_init ip6_tnl_init_net(struct net *net) @@ -2235,16 +2232,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) { + struct net *net; + LIST_HEAD(list); + rtnl_lock(); - ip6_tnl_destroy_tunnels(net); + list_for_each_entry(net, net_list, exit_list) + ip6_tnl_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit = ip6_tnl_exit_net, + .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..714914d1bb98 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1052,23 +1052,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, + struct list_head *list) { int h; struct ip6_tnl *t; - LIST_HEAD(list); for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); - unregister_netdevice_queue(t->dev, &list); - unregister_netdevice_many(&list); + unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) @@ -1108,18 +1107,24 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_net(struct net *net) +static void __net_exit vti6_exit_batch_net(struct list_head *net_list) { - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + struct net *net; + LIST_HEAD(list); rtnl_lock(); - vti6_destroy_tunnels(ip6n); + list_for_each_entry(net, net_list, exit_list) { + ip6n = net_generic(net, vti6_net_id); + vti6_destroy_tunnels(ip6n, &list); + } + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit = vti6_exit_net, + .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac912bb21747..a799f5258614 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1848,19 +1848,22 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) +static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); + struct net *net; rtnl_lock(); - sit_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; -- cgit v1.2.3 From 64bc17811b72758753e2b64cd8f2a63812c61fe1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:09 -0700 Subject: ipv4: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s After patch: $ time ./add_del_unshare.sh net_namespace 135 291 5504 1 2 : tunables 8 4 0 : slabdata 135 291 0 real 0m22.117s user 0m0.728s sys 0m35.328s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 22 +++++++++------------- net/ipv4/ip_tunnel.c | 12 +++++++++--- net/ipv4/ip_vti.c | 7 +++---- net/ipv4/ipip.c | 7 +++---- 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..9cee986ac6b8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1013,15 +1013,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1540,15 +1539,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1559,16 +1557,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..02d70ca99db1 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -452,15 +452,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..1e47818e38c7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -634,15 +634,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; -- cgit v1.2.3 From 91f4aa977947f046dc144fa9e3b06f0ffb53be79 Mon Sep 17 00:00:00 2001 From: Diogenes Pereira Date: Wed, 9 Aug 2017 13:19:24 -0300 Subject: mac802154: replace hardcoded value with macro Use IEEE802154_SCF_SECLEVEL_NONE macro defined at ieee802154.h file. Signed-off-by: Diogenes Pereira Signed-off-by: Stefan Schmidt --- net/mac802154/llsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 1e1c9b20bab7..edec2f9919d0 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -713,7 +713,8 @@ int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA) return -EINVAL; - if (!hdr.fc.security_enabled || hdr.sec.level == 0) { + if (!hdr.fc.security_enabled || + (hdr.sec.level == IEEE802154_SCF_SECLEVEL_NONE)) { skb_push(skb, hlen); return 0; } -- cgit v1.2.3 From 3e4962667efb0f6c09fa3111e6ee53838118b227 Mon Sep 17 00:00:00 2001 From: Diogenes Pereira Date: Tue, 5 Sep 2017 09:18:04 -0300 Subject: mac802154: Fix MAC header and payload encrypted According to 802.15.4-2003/2006/2015 specifications the MAC frame is composed of MHR, MAC payload and MFR and just the outgoing MAC payload must be encrypted. If communication is secure,sender build Auxiliary Security Header(ASH), insert it next to the standard MHR header with security enabled bit ON, and secure frames before transmitting them. According to the information carried within the ASH, recipient retrieves the right cryptographic key and correctly un-secure MAC frames. The error scenario occurs on Linux using IEEE802154_SCF_SECLEVEL_ENC(4) security level when llsec_do_encrypt_unauth() function builds theses MAC frames incorrectly. On recipients these MAC frames are discarded,logging "got invalid frame" messages. Signed-off-by: Diogenes Pereira Signed-off-by: Stefan Schmidt --- net/mac802154/llsec.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index edec2f9919d0..2fb703d70803 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -623,13 +623,18 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; struct scatterlist src; SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); - int err; + int err, datalen; + unsigned char *data; llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); - sg_init_one(&src, skb->data, skb->len); + /* Compute data payload offset and data length */ + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + sg_init_one(&src, data, datalen); + skcipher_request_set_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &src, &src, skb->len, iv); + skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return err; -- cgit v1.2.3 From d4e1b299ec2853dd3d90b71ae86fa2626e60dc25 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 20 Sep 2017 12:18:17 -0400 Subject: ipv6: Use ipv6_authlen for len in ipv6_skip_exthdr In ipv6_skip_exthdr, the lengh of AH header is computed manually as (hp->hdrlen+2)<<2. However, in include/linux/ipv6.h, a macro named ipv6_authlen is already defined for exactly the same job. This commit replaces the manual computation code with the macro. Signed-off-by: Xiang Gao Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 305e2ed730bf..115d60919f72 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -99,7 +99,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen+2)<<2; + hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); -- cgit v1.2.3 From 00ba4cb36da682c68dc87d1703a8aaffe2b4e9c5 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sat, 16 Sep 2017 16:18:33 +0200 Subject: bridge: also trigger RTM_NEWLINK when interface is released from bridge Currently, when an interface is released from a bridge via ioctl(), we get a RTM_DELLINK event through netlink: Deleted 2: dummy0: mtu 1500 master bridge0 state UNKNOWN link/ether 6e:23:c2:54:3a:b3 Userspace has to interpret that as a removal from the bridge, not as a complete removal of the interface. When an bridged interface is completely removed, we get two events: Deleted 2: dummy0: mtu 1500 master bridge0 state DOWN link/ether 6e:23:c2:54:3a:b3 Deleted 2: dummy0: mtu 1500 qdisc noop state DOWN group default link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff In constrast, when an interface is released from a bond, we get a RTM_NEWLINK with only the new characteristics (no master): 3: dummy1: mtu 1500 qdisc noqueue master bond0 state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 4: bond0: mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state DOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state DOWN group default link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff 4: bond0: mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff Userland may be confused by the fact we say a link is deleted while its characteristics are only modified. A first solution would have been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK event. However, maybe some piece of userland is relying on this RTM_DELLINK to detect when a bridged interface is released. Instead, we also emit a RTM_NEWLINK event once the interface is released (without master info). Deleted 2: dummy0: mtu 1500 master bridge0 state UNKNOWN link/ether 8a:bb:e7:94:b1:f8 2: dummy0: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff This is done only when using ioctl(). When using Netlink, such an event is already automatically emitted in do_setlink(). Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..3148cb3a8e82 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -99,8 +99,10 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (isadd) ret = br_add_if(br, dev); - else + else { ret = br_del_if(br, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + } return ret; } -- cgit v1.2.3 From 0d4a6608f68c7532dcbfec2ea1150c9761767d03 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 19 Sep 2017 12:11:43 +0200 Subject: udp: do rmem bulk free even if the rx sk queue is empty The commit 6b229cf77d68 ("udp: add batching to udp_rmem_release()") reduced greatly the cacheline contention between the BH and the US reader batching the rmem updates in most scenarios. Such optimization is explicitly avoided if the US reader is faster then BH processing. My fault, I initially suggested this kind of behavior due to concerns of possible regressions with small sk_rcvbuf values. Tests showed such concerns are misplaced, so this commit relaxes the condition for rmem bulk updates, obtaining small but measurable performance gain in the scenario described above. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..784ced0b9150 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1212,8 +1212,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; -- cgit v1.2.3 From eccaa9e51b77e504fa8e8357e4979bde724e22a8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 20 Sep 2017 15:39:59 -0700 Subject: Revert "bridge: also trigger RTM_NEWLINK when interface is released from bridge" This reverts commit 00ba4cb36da682c68dc87d1703a8aaffe2b4e9c5. Discussion with David Ahern determined that this change is actually not needed. Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 3148cb3a8e82..7970f8540cbb 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -99,10 +99,8 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (isadd) ret = br_add_if(br, dev); - else { + else ret = br_del_if(br, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); - } return ret; } -- cgit v1.2.3 From 53bade8a3372eea0f1276bc96892735c517330fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 18:00:37 -0700 Subject: net: dsa: Utilize dsa_slave_dev_check() Instead of open coding the check. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d51b10450e1b..6fc9eb094267 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1294,7 +1294,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (dev->netdev_ops != &dsa_slave_netdev_ops) + if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; if (event == NETDEV_CHANGEUPPER) -- cgit v1.2.3 From 79af1f866193de29e65a4dba7d0dab14b0c0ff93 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:29 +0200 Subject: mac80211: avoid allocating TXQs that won't be used For AP_VLAN and monitor interfaces we'll never use the TXQs we allocated, so avoid doing so. Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index f75029abf728..2619daa29961 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1772,7 +1772,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sizeof(void *)); int txq_size = 0; - if (local->ops->wake_tx_queue) + if (local->ops->wake_tx_queue && + type != NL80211_IFTYPE_AP_VLAN && + type != NL80211_IFTYPE_MONITOR) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; -- cgit v1.2.3 From 2512b1b18d0748d867bb22387db7c86b903291ad Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:31 +0300 Subject: mac80211: extend ieee80211_ie_split to support EXTENSION Current ieee80211_ie_split() implementation doesn't account for elements that are sub-elements of the EXTENSION IE. To extend support to these IEs as well, treat the WLAN_EID_EXTENSION ids in the %ids array as indicating that the next id in the array is a sub-element of the EXTENSION IE. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/util.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index bcb1284c3415..4aab793c2f00 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1367,13 +1367,29 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, } EXPORT_SYMBOL(cfg80211_get_p2p_attr); -static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; - for (i = 0; i < n_ids; i++) - if (ids[i] == id) + /* Make sure array values are legal */ + if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) + return false; + + i = 0; + while (i < n_ids) { + if (ids[i] == WLAN_EID_EXTENSION) { + if (id_ext && (ids[i + 1] == id)) + return true; + + i += 2; + continue; + } + + if (ids[i] == id && !id_ext) return true; + + i++; + } return false; } @@ -1403,14 +1419,36 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, { size_t pos = offset; - while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { + while (pos < ielen) { + u8 ext = 0; + + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], + ies[pos] == WLAN_EID_EXTENSION)) + break; + if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); - while (pos < ielen && - !ieee80211_id_in_list(after_ric, n_after_ric, - ies[pos])) - pos = skip_ie(ies, ielen, pos); + while (pos < ielen) { + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + else + ext = 0; + + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(after_ric, + n_after_ric, + ies[pos + ext], + ext == 2)) + pos = skip_ie(ies, ielen, pos); + } } else { pos = skip_ie(ies, ielen, pos); } -- cgit v1.2.3 From a7f26d8050c4f172d2dc523aabf45c5cbd9558ac Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 5 Aug 2017 11:44:32 +0300 Subject: mac80211: simplify and clarify IE splitting There's no need to split off IEs from the ones obtained from userspace, if they were already split off, so for example IEs that went before HT don't have to be listed again to go before VHT. Simplify the code here so it's clearer. While at it, also clarify the comments regarding the DMG (60 GHz) elements. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 19 +++++++------------ net/mac80211/util.c | 21 +++++++++------------ 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3b8e2709d8de..ee5ca1bc5a20 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -780,11 +780,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_HT_CAPABILITY, WLAN_EID_BSS_COEX_2040, + /* luckily this is almost always there */ WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, - /* 60GHz doesn't happen right now */ + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ WLAN_EID_VHT_CAPABILITY, WLAN_EID_OPMODE_NOTIF, }; @@ -811,22 +812,16 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) /* if present, add any custom IEs that go before VHT */ if (assoc_data->ie_len) { static const u8 before_vht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_EXT_SUPP_RATES, - WLAN_EID_PWR_CAPABILITY, - WLAN_EID_SUPPORTED_CHANNELS, - WLAN_EID_RSN, - WLAN_EID_QOS_CAPA, - WLAN_EID_RRM_ENABLED_CAPABILITIES, - WLAN_EID_MOBILITY_DOMAIN, - WLAN_EID_SUPPORTED_REGULATORY_CLASSES, - WLAN_EID_HT_CAPABILITY, + /* + * no need to list the ones split off before HT + * or generated here + */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; /* RIC already taken above, so no need to handle here anymore */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 6aef6793d052..bfecc3e86318 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1392,10 +1392,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_REQUEST, - WLAN_EID_EXT_SUPP_RATES, + /* + * no need to list the ones split off already + * (or generated here) + */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; @@ -1424,20 +1424,17 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_REQUEST, - WLAN_EID_EXT_SUPP_RATES, - WLAN_EID_DS_PARAMS, - WLAN_EID_SUPPORTED_REGULATORY_CLASSES, - WLAN_EID_HT_CAPABILITY, + /* + * no need to list the ones split off already + * (or generated here) + */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, - /* 60 GHz can't happen here right now */ + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), -- cgit v1.2.3 From ffa4629e0c2b8b015f5fa174149c6dd269b4142c Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Sat, 5 Aug 2017 11:44:38 +0300 Subject: nl80211: return error for invalid center_freq in 40 MHz When NL80211_ATTR_WIPHY_CHANNEL_TYPE is given, nl80211 would parse the channel definition the old way, discarding NL80211_ATTR_CENTER_FREQ1, NL80211_ATTR_CENTER_FREQ2 etc. However, it is possible that user space added both NL80211_ATTR_WIPHY_CHANNEL_TYPE and NL80211_ATTR_CENTER_FREQ1 or NL80211_ATTR_CENTER_FREQ2 assuming that all settings would be honored. In such a case, validate that NL80211_ATTR_CENTER_FREQ1 and NL80211_ATTR_CENTER_FREQ2 values match the channel configuration, as otherwise user space would assume that the desired configuration was applied. For example, when trying to start ap with NL80211_ATTR_WIPHY_CHANNEL_TYPE = NL80211_CHAN_HT40MINUS, NL80211_ATTR_WIPHY_FREQ = 5180 and NL80211_ATTR_CENTER_FREQ1 = 5250 without this fix, the ap will start on channel 36 (center_freq1 will be corrected to 5180). With this fix, we will throw an error instead. Signed-off-by: Tova Mussai Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..66e97136ab44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2122,6 +2122,15 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, case NL80211_CHAN_HT40MINUS: cfg80211_chandef_create(chandef, chandef->chan, chantype); + /* user input for center_freq is incorrect */ + if (info->attrs[NL80211_ATTR_CENTER_FREQ1] && + chandef->center_freq1 != nla_get_u32( + info->attrs[NL80211_ATTR_CENTER_FREQ1])) + return -EINVAL; + /* center_freq2 must be zero */ + if (info->attrs[NL80211_ATTR_CENTER_FREQ2] && + nla_get_u32(info->attrs[NL80211_ATTR_CENTER_FREQ2])) + return -EINVAL; break; default: return -EINVAL; -- cgit v1.2.3 From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001 From: Roee Zamir Date: Sun, 6 Aug 2017 11:38:22 +0300 Subject: nl80211: add OCE scan and capability flags Add Optimized Connectivity Experience (OCE) scan and capability flags. Some of them unique to OCE and some are stand alone. And add scan flags to enable/disable them. Signed-off-by: Roee Zamir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 137 ++++++++++++++++++++++++++++--------------------- 1 file changed, 78 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 66e97136ab44..2e6f5f4065f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6619,6 +6619,77 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static int +nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, + void *request, struct nlattr **attrs, + bool is_sched_scan) +{ + u8 *mac_addr, *mac_addr_mask; + u32 *flags; + enum nl80211_feature_flags randomness_flag; + + if (!attrs[NL80211_ATTR_SCAN_FLAGS]) + return 0; + + if (is_sched_scan) { + struct cfg80211_sched_scan_request *req = request; + + randomness_flag = wdev ? + NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR : + NL80211_FEATURE_ND_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } else { + struct cfg80211_scan_request *req = request; + + randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } + + *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); + + if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && + !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) + return -EOPNOTSUPP; + + if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { + int err; + + if (!(wiphy->features & randomness_flag) || + (wdev && wdev->current_bss)) + return -EOPNOTSUPP; + + err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask); + if (err) + return err; + } + + if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6824,34 +6895,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); } - if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { - request->flags = nla_get_u32( - info->attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - if (!(wiphy->features & - NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(info->attrs, - request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, info->attrs, + false); + if (err) + goto out_free; request->no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); @@ -7299,37 +7346,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->ie_len); } - if (attrs[NL80211_ATTR_SCAN_FLAGS]) { - request->flags = nla_get_u32( - attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - u32 flg = NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR; - - if (!wdev) /* must be net-detect */ - flg = NL80211_FEATURE_ND_RANDOM_MAC_ADDR; - - if (!(wiphy->features & flg)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev && wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(attrs, request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, attrs, true); + if (err) + goto out_free; if (attrs[NL80211_ATTR_SCHED_SCAN_DELAY]) request->delay = -- cgit v1.2.3 From 40b0bd24973487272167a09db040a70c053bedbe Mon Sep 17 00:00:00 2001 From: Roee Zamir Date: Sun, 6 Aug 2017 11:38:23 +0300 Subject: mac80211: oce: enable receiving of bcast probe resp One of OCE's optimizations is acception of broadcast probe responses. Accept broadcast probe responses but don't set NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP. Because a device's firmware may filter out the broadcast probe resp - drivers should set this flag. Signed-off-by: Roee Zamir Signed-off-by: Luca Coelho [johannes: make accepting broadcast conditional on the nl80211 scan flag that was added for that specific purpose] Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 47d2ed570470..ef2becaade50 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright 2016 Intel Deutschland GmbH + * Copyright 2016-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -183,6 +183,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local, return bss; } +static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, + u32 scan_flags, const u8 *da) +{ + if (!sdata) + return false; + /* accept broadcast for OCE */ + if (scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP && + is_broadcast_ether_addr(da)) + return true; + if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) + return true; + return ether_addr_equal(da, sdata->vif.addr); +} + void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); @@ -208,19 +222,24 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; + u32 scan_req_flags = 0, sched_scan_req_flags = 0; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); - /* ignore ProbeResp to foreign address unless scanning - * with randomised address + if (scan_req) + scan_req_flags = scan_req->flags; + + if (sched_scan_req) + sched_scan_req_flags = sched_scan_req->flags; + + /* ignore ProbeResp to foreign address or non-bcast (OCE) + * unless scanning with randomised address */ - if (!(sdata1 && - (ether_addr_equal(mgmt->da, sdata1->vif.addr) || - scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)) && - !(sdata2 && - (ether_addr_equal(mgmt->da, sdata2->vif.addr) || - sched_scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR))) + if (!ieee80211_scan_accept_presp(sdata1, scan_req_flags, + mgmt->da) && + !ieee80211_scan_accept_presp(sdata2, sched_scan_req_flags, + mgmt->da)) return; elements = mgmt->u.probe_resp.variable; -- cgit v1.2.3 From 1272c5d89b597995cb10db87dd4a1adc91d36006 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 18 Aug 2017 15:33:56 +0300 Subject: mac80211: add documentation to ieee80211_rx_ba_offl() Add documentation to ieee80211_rx_ba_offl() function and, while at it, rename the bit argument to tid, for consistency. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2849a1fc41c5..88cc1ae935ea 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -459,7 +459,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, - const u8 *addr, unsigned int bit) + const u8 *addr, unsigned int tid) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; @@ -470,7 +470,7 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, if (!sta) goto unlock; - set_bit(bit, sta->ampdu_mlme.tid_rx_manage_offl); + set_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl); ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); unlock: rcu_read_unlock(); -- cgit v1.2.3 From 1281103770e909e064edbb22a1115a0c14eca081 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 6 Sep 2017 17:18:33 +0300 Subject: mac80211: Simplify locking in ieee80211_sta_tear_down_BA_sessions() Simplify the locking in ieee80211_sta_tear_down_BA_sessions() and lock sta->ampdu_mlme.mtx over the entire function instead of locking/unlocking it for each TID etc. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d6d0b4201e40..41f5e48f8021 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -290,13 +290,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, { int i; + mutex_lock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - __ieee80211_stop_tx_ba_session(sta, i, reason); - __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_LEAVE_QBSS, - reason != AGG_STOP_DESTROY_STA && - reason != AGG_STOP_PEER_REQUEST); + ___ieee80211_stop_tx_ba_session(sta, i, reason); + ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_LEAVE_QBSS, + reason != AGG_STOP_DESTROY_STA && + reason != AGG_STOP_PEER_REQUEST); } + mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); -- cgit v1.2.3 From 4c121fd690d9c465e4cb09b7859adfdd6a0aee1d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Sep 2017 11:54:46 +0200 Subject: mac80211: use offsetofend() This was created using the following spatch: @find@ type S; expression M, M2; position p; @@ offsetof(S, M) + sizeof(M2)@p @script:python@ m << find.M; m2 << find.M2; @@ if not m2.endswith('-> ' + m): cocci.include_match(False) @change@ type find.S; expression find.M, find.M2; position find.p; @@ -offsetof(S, M) + sizeof(M2)@p +offsetofend(S, M) Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 3 +-- net/mac80211/mesh_hwmp.c | 8 ++++---- net/mac80211/mesh_plink.c | 3 +-- net/mac80211/util.c | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a550c707cd8a..7a76c4a6df30 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -675,8 +675,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) enum nl80211_band band; u8 *pos; struct ieee80211_sub_if_data *sdata; - int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + - sizeof(mgmt->u.beacon); + int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); rcu_read_lock(); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d8bbd0d2225a..146ec6c0f12f 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -111,8 +111,8 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) + - sizeof(mgmt->u.action.u.mesh_action); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.mesh_action); skb = dev_alloc_skb(local->tx_headroom + hdr_len + @@ -242,8 +242,8 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) + - sizeof(mgmt->u.action.u.mesh_action); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.mesh_action); if (time_before(jiffies, ifmsh->next_perr)) return -EAGAIN; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index f69c6c38ca43..dc8e10f87207 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -220,8 +220,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, bool include_plid = false; u16 peering_proto = 0; u8 *pos, ie_len = 4; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.self_prot) + - sizeof(mgmt->u.action.u.self_prot); + int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; skb = dev_alloc_skb(local->tx_headroom + diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bfecc3e86318..d57e5f6bd8b6 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2977,8 +2977,8 @@ int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch) + - sizeof(mgmt->u.action.u.chan_switch); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && -- cgit v1.2.3 From a6bcda44843c6dfced0fb973e2607c2a98addfa9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2017 11:52:43 +0200 Subject: cfg80211: remove unused function ieee80211_data_from_8023() This function hasn't been used since the removal of iwmc3200wifi in 2012. It also appears to have a bug when qos=True, since then it'll copy uninitialized stack memory to the SKB. Just remove the function entirely. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/util.c | 115 ---------------------------------------------------- 1 file changed, 115 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 4aab793c2f00..7dcdf67cba29 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -529,121 +529,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, - const u8 *bssid, bool qos) -{ - struct ieee80211_hdr hdr; - u16 hdrlen, ethertype; - __le16 fc; - const u8 *encaps_data; - int encaps_len, skip_header_bytes; - int nh_pos, h_pos; - int head_need; - - if (unlikely(skb->len < ETH_HLEN)) - return -EINVAL; - - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - - /* convert Ethernet header to proper 802.11 header (based on - * operation mode) */ - ethertype = (skb->data[12] << 8) | skb->data[13]; - fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); - - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_GO: - fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); - /* DA BSSID SA */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, addr, ETH_ALEN); - memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - fc |= cpu_to_le16(IEEE80211_FCTL_TODS); - /* BSSID SA DA */ - memcpy(hdr.addr1, bssid, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, skb->data, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_OCB: - case NL80211_IFTYPE_ADHOC: - /* DA SA BSSID */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, bssid, ETH_ALEN); - hdrlen = 24; - break; - default: - return -EOPNOTSUPP; - } - - if (qos) { - fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); - hdrlen += 2; - } - - hdr.frame_control = fc; - hdr.duration_id = 0; - hdr.seq_ctrl = 0; - - skip_header_bytes = ETH_HLEN; - if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { - encaps_data = bridge_tunnel_header; - encaps_len = sizeof(bridge_tunnel_header); - skip_header_bytes -= 2; - } else if (ethertype >= ETH_P_802_3_MIN) { - encaps_data = rfc1042_header; - encaps_len = sizeof(rfc1042_header); - skip_header_bytes -= 2; - } else { - encaps_data = NULL; - encaps_len = 0; - } - - skb_pull(skb, skip_header_bytes); - nh_pos -= skip_header_bytes; - h_pos -= skip_header_bytes; - - head_need = hdrlen + encaps_len - skb_headroom(skb); - - if (head_need > 0 || skb_cloned(skb)) { - head_need = max(head_need, 0); - if (head_need) - skb_orphan(skb); - - if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) - return -ENOMEM; - } - - if (encaps_data) { - memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); - nh_pos += encaps_len; - h_pos += encaps_len; - } - - memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); - - nh_pos += hdrlen; - h_pos += hdrlen; - - /* Update skb pointers to various headers since this modified frame - * is going to go through Linux networking code that may potentially - * need things like pointer to IP header. */ - skb_reset_mac_header(skb); - skb_set_network_header(skb, nh_pos); - skb_set_transport_header(skb, h_pos); - - return 0; -} -EXPORT_SYMBOL(ieee80211_data_from_8023); - static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) -- cgit v1.2.3 From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 18 Aug 2017 15:31:41 +0300 Subject: nl80211: add an option to allow MFP without requiring it The user space can now allow the kernel to associate to an AP that requires MFP or that doesn't have MFP enabled in the same NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag. The driver / firmware will decide whether to use it or not. Include a feature bit to advertise support for NL80211_MFP_OPTIONAL. This allows new user space to run on old kernels and know that it cannot use the new attribute if it isn't supported. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2e6f5f4065f9..1e39ba3cfd06 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8952,8 +8952,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_USE_MFP]) { connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (connect.mfp == NL80211_MFP_OPTIONAL && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MFP_OPTIONAL)) + return -EOPNOTSUPP; + if (connect.mfp != NL80211_MFP_REQUIRED && - connect.mfp != NL80211_MFP_NO) + connect.mfp != NL80211_MFP_NO && + connect.mfp != NL80211_MFP_OPTIONAL) return -EINVAL; } else { connect.mfp = NL80211_MFP_NO; -- cgit v1.2.3 From d405fd8cc807c045b23bc2df4a5ab6b85df614f3 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Sat, 5 Aug 2017 11:44:36 +0300 Subject: mac80211: recalculate some sta parameters after insertion Sometimes a station is added already in ASSOC state. For example, in AP mode, when a client station didn't get assoc resp and sends an assoc req again. If a station is inserted when its state is ASSOC or higher, the min chandef and allow_p2p_go_ps should be recalculated again after the insertion. Before this patch the recalculation happened only in sta_info_move_state which occurs before the insertion of the sta and thus even though it calls ieee80211_recalc_min_chandef/_p2p_go_ps_allowed functions, since sdata->local->sta_list is still empty at this point, it doesn't do anything. Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 69615016d5bf..ffcd25c4908c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -515,6 +515,31 @@ static int sta_info_insert_drv_state(struct ieee80211_local *local, return err; } +static void +ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + bool allow_p2p_go_ps = sdata->vif.p2p; + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata || + !test_sta_flag(sta, WLAN_STA_ASSOC)) + continue; + if (!sta->sta.support_p2p_ps) { + allow_p2p_go_ps = false; + break; + } + } + rcu_read_unlock(); + + if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { + sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); + } +} + /* * should be called with sta_mtx locked * this function replaces the mutex lock @@ -561,6 +586,13 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); + + if (sta->sta_state >= IEEE80211_STA_ASSOC) { + ieee80211_recalc_min_chandef(sta->sdata); + if (!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); + } + /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); @@ -1788,31 +1820,6 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(ieee80211_sta_set_buffered); -static void -ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - bool allow_p2p_go_ps = sdata->vif.p2p; - struct sta_info *sta; - - rcu_read_lock(); - list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (sdata != sta->sdata || - !test_sta_flag(sta, WLAN_STA_ASSOC)) - continue; - if (!sta->sta.support_p2p_ps) { - allow_p2p_go_ps = false; - break; - } - } - rcu_read_unlock(); - - if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { - sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); - } -} - int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { -- cgit v1.2.3 From 1bd773c077deeeb2d9ced1fdb6d846169b8e7e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sch=C3=BCtz?= Date: Thu, 7 Sep 2017 17:47:43 +0200 Subject: wireless: set correct mandatory rate flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to IEEE Std 802.11-2016 (16.2.3.4 Long PHY SIGNAL field) all of the following rates are mandatory for a HR/DSSS PHY: 1 Mb/s, 2 Mb/s, 5.5 Mb/s and 11 Mb/s. Set IEEE80211_RATE_MANDATORY_B flag for all of these instead of just 1 Mb/s to correctly reflect this. Signed-off-by: Richard Schütz [johannes: use switch statement] Signed-off-by: Johannes Berg --- net/wireless/util.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 7dcdf67cba29..7a1fcc6ee060 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -157,32 +157,30 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) case NL80211_BAND_2GHZ: want = 7; for (i = 0; i < sband->n_bitrates; i++) { - if (sband->bitrates[i].bitrate == 10) { + switch (sband->bitrates[i].bitrate) { + case 10: + case 20: + case 55: + case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; - } - - if (sband->bitrates[i].bitrate == 20 || - sband->bitrates[i].bitrate == 55 || - sband->bitrates[i].bitrate == 110 || - sband->bitrates[i].bitrate == 60 || - sband->bitrates[i].bitrate == 120 || - sband->bitrates[i].bitrate == 240) { + break; + case 60: + case 120: + case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; - } - - if (sband->bitrates[i].bitrate != 10 && - sband->bitrates[i].bitrate != 20 && - sband->bitrates[i].bitrate != 55 && - sband->bitrates[i].bitrate != 110) + /* fall through */ + default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; + break; + } } - WARN_ON(want != 0 && want != 3 && want != 6); + WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ -- cgit v1.2.3 From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most case we can do that is a cheaper way looking up the ifaddr hash table. This commit adds a helper for such lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by the commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") they do not give any measurable performance improvement: if we do the lookup with are on a slower path. This improves UDP performances for unconnected sockets when rp_filter is disabled by 5% and also gives small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 30 ++++++++++++++++++------------ net/ipv4/fib_frontend.c | 22 +++++++++++++++++----- 2 files changed, 35 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..7ce22a2c07ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f02819134ba2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* if no local routes are added from user space we can check + * for local addresses looking-up the ifaddr table + */ + if (net->ipv4.fib_has_custom_local_routes) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } -- cgit v1.2.3 From 4fa7b718881a5358286903b796968cea80993820 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Sep 2017 19:31:57 -0400 Subject: net: dsa: better scoping of slave functions A few DSA slave functions take a dsa_slave_priv pointer as first argument, whereas the scope of the slave.c functions is the slave net_device structure. Fix this and rename dsa_netpoll_send_skb to dsa_slave_netpoll_send_skb. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6fc9eb094267..4a98942a6135 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -385,10 +385,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev, return 0; } -static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p, - struct sk_buff *skb) +static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, + struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER + struct dsa_slave_priv *p = netdev_priv(dev); + if (p->netpoll) netpoll_send_skb(p->netpoll, skb); #else @@ -422,7 +424,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) - return dsa_netpoll_send_skb(p, nskb); + return dsa_slave_netpoll_send_skb(dev, nskb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType @@ -736,9 +738,9 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev, } static struct dsa_mall_tc_entry * -dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p, - unsigned long cookie) +dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { + struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) @@ -820,7 +822,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, if (!ds->ops->port_mirror_del) return; - mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie); + mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; @@ -1027,10 +1029,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct dsa_slave_priv *p, - struct net_device *slave_dev, - int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); @@ -1046,9 +1047,9 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, p->phy_interface); } -static int dsa_slave_phy_setup(struct dsa_slave_priv *p, - struct net_device *slave_dev) +static int dsa_slave_phy_setup(struct net_device *slave_dev) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; @@ -1088,7 +1089,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!phy_is_fixed && phy_id >= 0 && (ds->phys_mii_mask & (1 << phy_id))) { - ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + ret = dsa_slave_phy_connect(slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); of_node_put(phy_dn); @@ -1111,7 +1112,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, * MDIO bus instead */ if (!p->phy) { - ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index); + ret = dsa_slave_phy_connect(slave_dev, p->dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->dp->index, ret); @@ -1233,7 +1234,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) netif_carrier_off(slave_dev); - ret = dsa_slave_phy_setup(p, slave_dev); + ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); unregister_netdev(slave_dev); -- cgit v1.2.3 From de40fc5d210f2c31f2c25a9b920861276a71b70d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Sep 2017 19:32:14 -0400 Subject: net: dsa: add port fdb dump Dumping a DSA port's FDB entries is not specific to a DSA slave, so add a dsa_port_fdb_dump function, similarly to dsa_port_fdb_add and dsa_port_fdb_del. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 11 +++++++++++ net/dsa/slave.c | 9 ++------- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f616b3444418..9803952a5b40 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -128,6 +128,7 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); diff --git a/net/dsa/port.c b/net/dsa/port.c index 659676ba3f8b..76d43a82d397 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -173,6 +173,17 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_fdb_dump) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_dump(ds, port, cb, data); +} + int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4a98942a6135..02ace7d462c4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -263,16 +263,11 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, }; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; int err; - if (!ds->ops->port_fdb_dump) - return -EOPNOTSUPP; - - err = ds->ops->port_fdb_dump(ds, dp->index, - dsa_slave_port_fdb_do_dump, - &dump); + err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump); *idx = dump.idx; + return err; } -- cgit v1.2.3 From 51957bc53aa7ca60d63f1ba0d9e3d887562b1723 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:17:34 +0200 Subject: net/smc: parameter cleanup in smc_cdc_get_free_slot() Use the smc_connection as first parameter with smc_cdc_get_free_slot(). This is just a small code cleanup, no functional change. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 7 ++++--- net/smc/smc_cdc.h | 3 ++- net/smc/smc_tx.c | 6 ++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7294edbc221..5ef97e5a5f78 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -62,10 +62,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, bh_unlock_sock(&smc->sk); } -int smc_cdc_get_free_slot(struct smc_link *link, +int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend) { + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, (struct smc_wr_tx_pend_priv **)pend); } @@ -118,8 +120,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc) return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 8e1d76f26007..56f883d1159c 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -206,7 +206,8 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, struct smc_cdc_tx_pend; -int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf, +int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend); void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..e2228f6d1c25 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -394,8 +394,7 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) int rc; spin_lock_bh(&conn->send_lock); - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -463,8 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn) ((to_confirm > conn->rmbe_update_limit) && ((to_confirm > (conn->rmbe_size / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], - &wr_buf, &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { -- cgit v1.2.3 From 8701352b22a2f82c814283131587e970a56b69a8 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Thu, 21 Sep 2017 12:05:25 +0200 Subject: bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl Currently, there is a difference in netlink events received when an interface is modified through bridge ioctl() or through netlink. This patch generates additional events when an interface is added to or removed from a bridge via ioctl(). When adding then removing an interface from a bridge with netlink, we get: 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff When using ioctl(): 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff Without this patch, the last netlink notification is not sent. Signed-off-by: Vincent Bernat Reviewed-by: Stephen Hemminger Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..66cd98772051 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) else ret = br_del_if(br, dev); + if (!ret) + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + return ret; } -- cgit v1.2.3 From 2ed343f98178ff232b504bbd006b34db07835082 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 16:29:33 +0530 Subject: net:nfc: use setup_timer Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/core.c b/net/nfc/core.c index 5cf33df888c3..e5e23c2cbe74 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->targets_generation = 1; if (ops->check_presence) { - init_timer(&dev->check_pres_timer); - dev->check_pres_timer.data = (unsigned long)dev; - dev->check_pres_timer.function = nfc_check_pres_timeout; + setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout, + (unsigned long)dev); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } -- cgit v1.2.3 From 77478715ba9242017976fd01de189e77fa072f51 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Sep 2017 18:23:17 -0400 Subject: ceph: use get_user_pages_fast() Signed-off-by: Al Viro --- net/ceph/pagevec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 1a7c9a79a53c..4098b17d0812 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -24,9 +24,9 @@ struct page **ceph_get_direct_page_vector(const void __user *data, return ERR_PTR(-ENOMEM); while (got < num_pages) { - rc = get_user_pages_unlocked( + rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, pages + got, write_page ? FOLL_WRITE : 0); + num_pages - got, write_page, pages + got); if (rc < 0) break; BUG_ON(rc == 0); -- cgit v1.2.3 From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Sep 2017 10:25:22 +0800 Subject: net: Remove useless function skb_header_release There is no one which would invokes the function skb_header_release. So just remove it now. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- net/batman-adv/soft-interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 10f7edfb176e..c2c986746d0b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using skb_header_release in our skbs to allow skb_cow_header to + * data using __skb_header_release in our skbs to allow skb_cow_header to * work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer -- cgit v1.2.3 From 52a59bd509e3dc458be99dcf333b778e6e3b3749 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:33:29 +0300 Subject: net: use 32-bit arithmetic while allocating net device Private part of allocation is never big enough to warrant size_t. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-10 (-10) function old new delta alloc_netdev_mqs 1120 1110 -10 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..37d6a3c59e69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7989,7 +7989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - size_t alloc_size; + unsigned int alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); -- cgit v1.2.3 From 373b8eeb0c15d4ce58f62afb12f213b1b5bbc3d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:45:43 +0300 Subject: xfrm: make aead_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..32c67b80c3ce 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -84,7 +84,7 @@ static int verify_aead(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < aead_len(algp)) + if (nla_len(rt) < (int)aead_len(algp)) return -EINVAL; algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; -- cgit v1.2.3 From 06cd22f830f28023b82455c82c7db65fc6cf9c16 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:46:30 +0300 Subject: xfrm: make xfrm_alg_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 32c67b80c3ce..09512d90e6a5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -42,7 +42,7 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) return 0; algp = nla_data(rt); - if (nla_len(rt) < xfrm_alg_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_len(algp)) return -EINVAL; switch (type) { -- cgit v1.2.3 From 1bd963a72e859d194d87a5a2a8839efee7e23102 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:09 +0300 Subject: xfrm: make xfrm_alg_auth_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 09512d90e6a5..465c23d4ea78 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -68,7 +68,7 @@ static int verify_auth_trunc(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < xfrm_alg_auth_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) return -EINVAL; algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; -- cgit v1.2.3 From 5e708e47c44366453c33373940455a75fd33f635 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:50 +0300 Subject: xfrm: make xfrm_replay_state_esn_len() return unsigned int Replay detection bitmaps can't have negative length. Comparisons with nla_len() are left signed just in case negative value can sneak in there. Propagate unsignedness for code size savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-38 (-38) function old new delta xfrm_state_construct 1802 1800 -2 xfrm_update_ae_params 295 289 -6 xfrm_state_migrate 1345 1339 -6 xfrm_replay_notify_esn 349 337 -12 xfrm_replay_notify_bmp 345 333 -12 Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 465c23d4ea78..83718db5ec9c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -130,7 +130,7 @@ static inline int verify_replay(struct xfrm_usersa_info *p, if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) return -EINVAL; - if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && + if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && nla_len(rt) != sizeof(*rs)) return -EINVAL; } @@ -404,7 +404,7 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es struct nlattr *rp) { struct xfrm_replay_state_esn *up; - int ulen; + unsigned int ulen; if (!replay_esn || !rp) return 0; @@ -414,7 +414,7 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es /* Check the overall length and the internal bitmap length to avoid * potential overflow. */ - if (nla_len(rp) < ulen || + if (nla_len(rp) < (int)ulen || xfrm_replay_state_esn_len(replay_esn) != ulen || replay_esn->bmp_len != up->bmp_len) return -EINVAL; @@ -430,14 +430,14 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; - int klen, ulen; + unsigned int klen, ulen; if (!rta) return 0; up = nla_data(rta); klen = xfrm_replay_state_esn_len(up); - ulen = nla_len(rta) >= klen ? klen : sizeof(*up); + ulen = nla_len(rta) >= (int)klen ? klen : sizeof(*up); p = kzalloc(klen, GFP_KERNEL); if (!p) -- cgit v1.2.3 From a1b831f23a2b3306ecf275872a54abcf97b0b977 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:48:54 +0300 Subject: xfrm: eradicate size_t All netlink message sizes are a) unsigned, b) can't be >= 4GB in size because netlink doesn't support >= 64KB messages in the first place. All those size_t across the code are a scam especially across networking which likes to work with small numbers like 1500 or 65536. Propagate unsignedness and flip some "int" to "unsigned int" as well. This is preparation to switching nlmsg_new() to "unsigned int". Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 83718db5ec9c..f7a12aa15086 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -458,9 +458,9 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn return 0; } -static inline int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx) +static inline unsigned int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx) { - int len = 0; + unsigned int len = 0; if (xfrm_ctx) { len += sizeof(struct xfrm_user_sec_ctx); @@ -1031,7 +1031,7 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, return -1; } -static inline size_t xfrm_spdinfo_msgsize(void) +static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) @@ -1157,7 +1157,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } -static inline size_t xfrm_sadinfo_msgsize(void) +static inline unsigned int xfrm_sadinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_sadhinfo)) @@ -1633,7 +1633,7 @@ static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *s return copy_sec_ctx(xp->security, skb); return 0; } -static inline size_t userpolicy_type_attrsize(void) +static inline unsigned int userpolicy_type_attrsize(void) { #ifdef CONFIG_XFRM_SUB_POLICY return nla_total_size(sizeof(struct xfrm_userpolicy_type)); @@ -1850,9 +1850,9 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } -static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x) +static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) { - size_t replay_size = x->replay_esn ? + unsigned int replay_size = x->replay_esn ? xfrm_replay_state_esn_len(x->replay_esn) : sizeof(struct xfrm_replay_state); @@ -2321,8 +2321,8 @@ static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk); } -static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma, - int with_encp) +static inline unsigned int xfrm_migrate_msgsize(int num_migrate, int with_kma, + int with_encp) { return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id)) + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0) @@ -2566,7 +2566,7 @@ static void xfrm_netlink_rcv(struct sk_buff *skb) mutex_unlock(&net->xfrm.xfrm_cfg_mutex); } -static inline size_t xfrm_expire_msgsize(void) +static inline unsigned int xfrm_expire_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + nla_total_size(sizeof(struct xfrm_mark)); @@ -2654,9 +2654,9 @@ static int xfrm_notify_sa_flush(const struct km_event *c) return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA); } -static inline size_t xfrm_sa_len(struct xfrm_state *x) +static inline unsigned int xfrm_sa_len(struct xfrm_state *x) { - size_t l = 0; + unsigned int l = 0; if (x->aead) l += nla_total_size(aead_len(x->aead)); if (x->aalg) { @@ -2701,8 +2701,9 @@ static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c) struct xfrm_usersa_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; - int len = xfrm_sa_len(x); - int headlen, err; + unsigned int len = xfrm_sa_len(x); + unsigned int headlen; + int err; headlen = sizeof(*p); if (c->event == XFRM_MSG_DELSA) { @@ -2776,8 +2777,8 @@ static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c } -static inline size_t xfrm_acquire_msgsize(struct xfrm_state *x, - struct xfrm_policy *xp) +static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x, + struct xfrm_policy *xp) { return NLMSG_ALIGN(sizeof(struct xfrm_user_acquire)) + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) @@ -2900,7 +2901,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, return xp; } -static inline size_t xfrm_polexpire_msgsize(struct xfrm_policy *xp) +static inline unsigned int xfrm_polexpire_msgsize(struct xfrm_policy *xp) { return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire)) + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) @@ -2957,13 +2958,14 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { - int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); + unsigned int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); struct net *net = xp_net(xp); struct xfrm_userpolicy_info *p; struct xfrm_userpolicy_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; - int headlen, err; + unsigned int headlen; + int err; headlen = sizeof(*p); if (c->event == XFRM_MSG_DELPOLICY) { @@ -3070,7 +3072,7 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct } -static inline size_t xfrm_report_msgsize(void) +static inline unsigned int xfrm_report_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_report)); } @@ -3115,7 +3117,7 @@ static int xfrm_send_report(struct net *net, u8 proto, return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT); } -static inline size_t xfrm_mapping_msgsize(void) +static inline unsigned int xfrm_mapping_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_mapping)); } -- cgit v1.2.3 From 2d04cfcfa2e86cac751979b584e5420dd470903b Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:02 +0530 Subject: net: af_packet: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..b0f221885dfd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -544,9 +544,7 @@ static void prb_init_blk_timer(struct packet_sock *po, struct tpacket_kbdq_core *pkc, void (*func) (unsigned long)) { - init_timer(&pkc->retire_blk_timer); - pkc->retire_blk_timer.data = (long)po; - pkc->retire_blk_timer.function = func; + setup_timer(&pkc->retire_blk_timer, func, (long)po); pkc->retire_blk_timer.expires = jiffies; } -- cgit v1.2.3 From b1e07c5486f0df26caf93234bd7db5545a37aba8 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:03 +0530 Subject: net: nfc: hci: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index b740fef0acc5..a8a6e7814e09 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -1004,9 +1004,8 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev) INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work); - init_timer(&hdev->cmd_timer); - hdev->cmd_timer.data = (unsigned long)hdev; - hdev->cmd_timer.function = nfc_hci_cmd_timeout; + setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout, + (unsigned long)hdev); skb_queue_head_init(&hdev->rx_hcp_frags); -- cgit v1.2.3 From 22d387e13ac357279ddac55694883fd3e30a5639 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:04 +0530 Subject: net: nfc: hci: llc_shdlc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/hci/llc_shdlc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 17e59a009ce6..58df37eae1e8 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -763,17 +763,14 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, mutex_init(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; - init_timer(&shdlc->connect_timer); - shdlc->connect_timer.data = (unsigned long)shdlc; - shdlc->connect_timer.function = llc_shdlc_connect_timeout; + setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t1_timer); - shdlc->t1_timer.data = (unsigned long)shdlc; - shdlc->t1_timer.function = llc_shdlc_t1_timeout; + setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t2_timer); - shdlc->t2_timer.data = (unsigned long)shdlc; - shdlc->t2_timer.function = llc_shdlc_t2_timeout; + setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout, + (unsigned long)shdlc); shdlc->w = SHDLC_MAX_WINDOW; shdlc->srej_support = SHDLC_SREJ_SUPPORT; -- cgit v1.2.3 From d835b63cc4ee67e59eed9d1957f729c0a30b7331 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:05 +0530 Subject: net: nfc: llcp_core: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/llcp_core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 02eef5cf3cce..7988185072e5 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1573,9 +1573,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); - init_timer(&local->link_timer); - local->link_timer.data = (unsigned long) local; - local->link_timer.function = nfc_llcp_symm_timer; + setup_timer(&local->link_timer, nfc_llcp_symm_timer, + (unsigned long)local); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); @@ -1601,9 +1600,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); - init_timer(&local->sdreq_timer); - local->sdreq_timer.data = (unsigned long) local; - local->sdreq_timer.function = nfc_llcp_sdreq_timer; + setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer, + (unsigned long)local); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); list_add(&local->list, &llcp_devices); -- cgit v1.2.3 From 6457edfe7344ac1da334b9f24a42aacea084a451 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 22 Sep 2017 19:01:55 -0400 Subject: net: dsa: make slave close symmetrical to open The DSA slave open function configures the unicast MAC addresses on the master device, enable the switch port, change its STP state, then start the PHY device. Make the close function symmetric, by first stopping the PHY device, then changing the STP state, disabling the switch port and restore the master device. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 02ace7d462c4..c2bb48579032 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -133,6 +133,11 @@ static int dsa_slave_close(struct net_device *dev) if (p->phy) phy_stop(p->phy); + dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); + + if (ds->ops->port_disable) + ds->ops->port_disable(ds, p->dp->index, p->phy); + dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); if (dev->flags & IFF_ALLMULTI) @@ -143,11 +148,6 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->dp->index, p->phy); - - dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); - return 0; } -- cgit v1.2.3 From fb8a6a2b8b7cfee8a0cf2cd431e1d0f45dd1c9e0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 22 Sep 2017 19:01:56 -0400 Subject: net: dsa: add port enable and disable helpers Provide dsa_port_enable and dsa_port_disable helpers to respectively enable and disable a switch port. This makes the dsa_port_set_state_now helper static. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 3 ++- net/dsa/port.c | 31 ++++++++++++++++++++++++++++++- net/dsa/slave.c | 19 +++++-------------- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9803952a5b40..0298a0f6a349 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -117,7 +117,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); -void dsa_port_set_state_now(struct dsa_port *dp, u8 state); +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, diff --git a/net/dsa/port.c b/net/dsa/port.c index 76d43a82d397..72c8dbd3d3f2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -56,7 +56,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, return 0; } -void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) { int err; @@ -65,6 +65,35 @@ void dsa_port_set_state_now(struct dsa_port *dp, u8 state) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +{ + u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + int err; + + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, port, phy); + if (err) + return err; + } + + dsa_port_set_state_now(dp, stp_state); + + return 0; +} + +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + dsa_port_set_state_now(dp, BR_STATE_DISABLED); + + if (ds->ops->port_disable) + ds->ops->port_disable(ds, port, phy); +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c2bb48579032..bd51ef56ec5b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -73,9 +73,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; struct net_device *master = dsa_master_netdev(p); - u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; if (!(master->flags & IFF_UP)) @@ -98,13 +96,9 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->ops->port_enable) { - err = ds->ops->port_enable(ds, p->dp->index, p->phy); - if (err) - goto clear_promisc; - } - - dsa_port_set_state_now(p->dp, stp_state); + err = dsa_port_enable(dp, p->phy); + if (err) + goto clear_promisc; if (p->phy) phy_start(p->phy); @@ -128,15 +122,12 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = dsa_master_netdev(p); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = p->dp; if (p->phy) phy_stop(p->phy); - dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); - - if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->dp->index, p->phy); + dsa_port_disable(dp, p->phy); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); -- cgit v1.2.3 From 3aa605f28b0d004a640a826380b39c7dcf70195d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Sep 2017 11:07:28 -0700 Subject: sch_netem: faster rb tree removal While running TCP tests involving netem storing millions of packets, I had the idea to speed up tfifo_reset() and did experiments. I tried the rbtree_postorder_for_each_entry_safe() method that is used in skb_rbtree_purge() but discovered it was slower than the current tfifo_reset() method. I measured time taken to release skbs with three occupation levels : 10^4, 10^5 and 10^6 skbs with three methods : 1) (current 'naive' method) while ((p = rb_first(&q->t_root))) { struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); rtnl_kfree_skbs(skb, skb); } 2) Use rb_next() instead of rb_first() in the loop : p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = netem_rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } 3) "optimized" method using rbtree_postorder_for_each_entry_safe() struct sk_buff *skb, *next; rbtree_postorder_for_each_entry_safe(skb, next, &q->t_root, rbnode) { rtnl_kfree_skbs(skb, skb); } q->t_root = RB_ROOT; Results : method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 10000 skbs in 541846 ns (54 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb) method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 100000 skbs in 5942456 ns (59 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb) method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 1000000 skbs in 82619635 ns (82 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb) Method 2) is simply faster, probably because it maintains a smaller working size set. Note that this is the method we use in tcp_ofo_queue() already. I will also change skb_rbtree_purge() in a second patch. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 063a4bdb9ee6..5a4f10080290 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -361,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct rb_node *p; + struct rb_node *p = rb_first(&q->t_root); - while ((p = rb_first(&q->t_root))) { + while (p) { struct sk_buff *skb = netem_rb_to_skb(p); - rb_erase(p, &q->t_root); + p = rb_next(p); + rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } } -- cgit v1.2.3 From 7c90584c66cc4b033a3b684b0e0950f79e7b7166 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Sep 2017 12:39:12 -0700 Subject: net: speed up skb_rbtree_purge() As measured in my prior patch ("sch_netem: faster rb tree removal"), rbtree_postorder_for_each_entry_safe() is nice looking but much slower than using rb_next() directly, except when tree is small enough to fit in CPU caches (then the cost is the same) Also note that there is not even an increase of text size : $ size net/core/skbuff.o.before net/core/skbuff.o text data bss dec hex filename 40711 1298 0 42009 a419 net/core/skbuff.o.before 40711 1298 0 42009 a419 net/core/skbuff.o From: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16982de649b9..000ce735fa8d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2848,12 +2848,15 @@ EXPORT_SYMBOL(skb_queue_purge); */ void skb_rbtree_purge(struct rb_root *root) { - struct sk_buff *skb, *next; + struct rb_node *p = rb_first(root); - rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) - kfree_skb(skb); + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - *root = RB_ROOT; + p = rb_next(p); + rb_erase(&skb->rbnode, root); + kfree_skb(skb); + } } /** -- cgit v1.2.3 From 01ccdf126ca5f9d4fe0889f65ee67afac910f19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:03:04 +0300 Subject: neigh: make strucrt neigh_table::entry_size unsigned int Key length can't be negative. Leave comparisons against nla_len() signed just in case truncated attribute can sneak in there. Space savings: add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-7 (-7) function old new delta pneigh_delete 273 272 -1 mlx5e_rep_netevent_event 1415 1414 -1 mlx5e_create_encap_header_ipv6 1194 1193 -1 mlx5e_create_encap_header_ipv4 1071 1070 -1 cxgb4_l2t_get 1104 1103 -1 __pneigh_lookup 69 68 -1 __neigh_create 2452 2451 -1 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/neighbour.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16a1a4c4eb57..6ea3a1a7f36a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -457,7 +457,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey) { struct neighbour *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val; struct neigh_hash_table *nht; @@ -488,7 +488,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; @@ -572,7 +572,7 @@ out_neigh_release: } EXPORT_SYMBOL(__neigh_create); -static u32 pneigh_hash(const void *pkey, int key_len) +static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); @@ -585,7 +585,7 @@ static u32 pneigh_hash(const void *pkey, int key_len) static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, - int key_len, + unsigned int key_len, struct net_device *dev) { while (n) { @@ -601,7 +601,7 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], @@ -614,7 +614,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net_device *dev, int creat) { struct pneigh_entry *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); @@ -659,7 +659,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); @@ -1662,7 +1662,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) goto out; if (ndm->ndm_flags & NTF_PROXY) { @@ -1730,7 +1730,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; -- cgit v1.2.3 From 98e4fcff3e755c6c3de2d4f63c080b4009a599bd Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:21:42 +0200 Subject: datagram: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index f7fb7e3f2acf..0b7b4c22719e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -188,7 +188,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, } if (!skb->len) { skb = skb_set_peeked(skb); - if (unlikely(IS_ERR(skb))) { + if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } -- cgit v1.2.3 From 63a4e80be4f523dbde6412decfa8244ff73cc4b9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:22:31 +0200 Subject: ipv6: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..13c3b697f8c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3297,7 +3297,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) + if (IS_ERR(rt)) return PTR_ERR(rt); /* ifp->rt can be accessed outside of rtnl */ -- cgit v1.2.3 From d9db5e3680c87fdbdf86fcb1c42caea4bf98680c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:22:58 +0200 Subject: kcm: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index af4e76ac88ff..0b750a22c4b9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1650,7 +1650,7 @@ static int kcm_clone(struct socket *osock, struct kcm_clone *info, } newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { err = PTR_ERR(newfile); goto out_sock_alloc_fail; } -- cgit v1.2.3 From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/bpf/test_run.c | 2 +- net/core/filter.c | 14 +++++++------- net/core/lwt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..df672517b4fd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..c468e7cfad19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1307731ddfe4..e7e626fb87bb 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..36671b0fb125 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } -- cgit v1.2.3 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/bpf/test_run.c | 1 + net/core/dev.c | 31 ++++++++++++++++++---- net/core/filter.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++--- net/core/skbuff.c | 2 ++ 4 files changed, 102 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index df672517b4fd..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..e350c768d4b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3864,8 +3864,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index c468e7cfad19..9b6e7e84aafd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..d98c2e3ce2bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). -- cgit v1.2.3 From f4344e0a48121d07cf8f69415278e84c39dbe9bf Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:31 -0400 Subject: net: dsa: return -ENODEV is there is no slave PHY Instead of returning -EOPNOTSUPP when a slave device has no PHY, directly return -ENODEV as ethtool and phylib do. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bd51ef56ec5b..79c5a0cd9923 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -266,10 +266,10 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return phy_mii_ioctl(p->phy, ifr, cmd); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return phy_mii_ioctl(p->phy, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -429,7 +429,7 @@ dsa_slave_get_link_ksettings(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); if (!p->phy) - return -EOPNOTSUPP; + return -ENODEV; phy_ethtool_ksettings_get(p->phy, cmd); @@ -442,10 +442,10 @@ dsa_slave_set_link_ksettings(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return phy_ethtool_ksettings_set(p->phy, cmd); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return phy_ethtool_ksettings_set(p->phy, cmd); } static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -481,22 +481,22 @@ static int dsa_slave_nway_reset(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return genphy_restart_aneg(p->phy); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return genphy_restart_aneg(p->phy); } static u32 dsa_slave_get_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) { - genphy_update_link(p->phy); - return p->phy->link; - } + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + genphy_update_link(p->phy); + + return p->phy->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) -- cgit v1.2.3 From 0115dcd1787d2d1c50719fab98d6bcb0c17931a1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:32 -0400 Subject: net: dsa: use slave device phydev There is no need to store a phy_device in dsa_slave_priv since net_device already provides one. Simply s/p->phy/dev->phydev/. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 - net/dsa/slave.c | 116 ++++++++++++++++++++++++----------------------------- 2 files changed, 53 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 0298a0f6a349..eccc62776283 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -79,7 +79,6 @@ struct dsa_slave_priv { * The phylib phy_device pointer for the PHY connected * to this port. */ - struct phy_device *phy; phy_interface_t phy_interface; int old_link; int old_pause; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 79c5a0cd9923..4ea1c6eb0da8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -96,12 +96,12 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - err = dsa_port_enable(dp, p->phy); + err = dsa_port_enable(dp, dev->phydev); if (err) goto clear_promisc; - if (p->phy) - phy_start(p->phy); + if (dev->phydev) + phy_start(dev->phydev); return 0; @@ -124,10 +124,10 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_master_netdev(p); struct dsa_port *dp = p->dp; - if (p->phy) - phy_stop(p->phy); + if (dev->phydev) + phy_stop(dev->phydev); - dsa_port_disable(dp, p->phy); + dsa_port_disable(dp, dev->phydev); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); @@ -264,12 +264,10 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return phy_mii_ioctl(p->phy, ifr, cmd); + return phy_mii_ioctl(dev->phydev, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -426,12 +424,10 @@ static int dsa_slave_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - phy_ethtool_ksettings_get(p->phy, cmd); + phy_ethtool_ksettings_get(dev->phydev, cmd); return 0; } @@ -440,12 +436,10 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return phy_ethtool_ksettings_set(p->phy, cmd); + return phy_ethtool_ksettings_set(dev->phydev, cmd); } static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -479,24 +473,20 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) static int dsa_slave_nway_reset(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return genphy_restart_aneg(p->phy); + return genphy_restart_aneg(dev->phydev); } static u32 dsa_slave_get_link(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - genphy_update_link(p->phy); + genphy_update_link(dev->phydev); - return p->phy->link; + return dev->phydev->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) @@ -631,7 +621,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -642,12 +632,12 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) return ret; if (e->eee_enabled) { - ret = phy_init_eee(p->phy, 0); + ret = phy_init_eee(dev->phydev, 0); if (ret) return ret; } - return phy_ethtool_set_eee(p->phy, e); + return phy_ethtool_set_eee(dev->phydev, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) @@ -657,7 +647,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee) @@ -667,7 +657,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (ret) return ret; - return phy_ethtool_get_eee(p->phy, e); + return phy_ethtool_get_eee(dev->phydev, e); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -976,26 +966,26 @@ static void dsa_slave_adjust_link(struct net_device *dev) struct dsa_switch *ds = p->dp->ds; unsigned int status_changed = 0; - if (p->old_link != p->phy->link) { + if (p->old_link != dev->phydev->link) { status_changed = 1; - p->old_link = p->phy->link; + p->old_link = dev->phydev->link; } - if (p->old_duplex != p->phy->duplex) { + if (p->old_duplex != dev->phydev->duplex) { status_changed = 1; - p->old_duplex = p->phy->duplex; + p->old_duplex = dev->phydev->duplex; } - if (p->old_pause != p->phy->pause) { + if (p->old_pause != dev->phydev->pause) { status_changed = 1; - p->old_pause = p->phy->pause; + p->old_pause = dev->phydev->pause; } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->dp->index, p->phy); + ds->ops->adjust_link(ds, p->dp->index, dev->phydev); if (status_changed) - phy_print_status(p->phy); + phy_print_status(dev->phydev); } static int dsa_slave_fixed_link_update(struct net_device *dev, @@ -1020,17 +1010,18 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; - p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); - if (!p->phy) { + slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); + if (!slave_dev->phydev) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) - p->phy_interface = p->phy->interface; - return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); + p->phy_interface = slave_dev->phydev->interface; + + return phy_connect_direct(slave_dev, slave_dev->phydev, + dsa_slave_adjust_link, p->phy_interface); } static int dsa_slave_phy_setup(struct net_device *slave_dev) @@ -1082,22 +1073,23 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } } else { - p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); + slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); } of_node_put(phy_dn); } - if (p->phy && phy_is_fixed) - fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + if (slave_dev->phydev && phy_is_fixed) + fixed_phy_set_link_update(slave_dev->phydev, + dsa_slave_fixed_link_update); /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ - if (!p->phy) { + if (!slave_dev->phydev) { ret = dsa_slave_phy_connect(slave_dev, p->dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", @@ -1108,7 +1100,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) } } - phy_attached_info(p->phy); + phy_attached_info(slave_dev->phydev); return 0; } @@ -1128,12 +1120,12 @@ int dsa_slave_suspend(struct net_device *slave_dev) netif_device_detach(slave_dev); - if (p->phy) { - phy_stop(p->phy); + if (slave_dev->phydev) { + phy_stop(slave_dev->phydev); p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - phy_suspend(p->phy); + phy_suspend(slave_dev->phydev); } return 0; @@ -1141,13 +1133,11 @@ int dsa_slave_suspend(struct net_device *slave_dev) int dsa_slave_resume(struct net_device *slave_dev) { - struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_attach(slave_dev); - if (p->phy) { - phy_resume(p->phy); - phy_start(p->phy); + if (slave_dev->phydev) { + phy_resume(slave_dev->phydev); + phy_start(slave_dev->phydev); } return 0; @@ -1240,8 +1230,8 @@ void dsa_slave_destroy(struct net_device *slave_dev) port_dn = p->dp->dn; netif_carrier_off(slave_dev); - if (p->phy) { - phy_disconnect(p->phy); + if (slave_dev->phydev) { + phy_disconnect(slave_dev->phydev); if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); -- cgit v1.2.3 From 771df31ace8a0264bcc3576d3f02660e3366cd6d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:33 -0400 Subject: net: dsa: use phy_ethtool_get_link_ksettings Use phy_ethtool_get_link_ksettings now that dsa_slave_get_link_ksettings does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4ea1c6eb0da8..bb0f64f47ae7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -420,17 +420,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) } /* ethtool operations *******************************************************/ -static int -dsa_slave_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - if (!dev->phydev) - return -ENODEV; - - phy_ethtool_ksettings_get(dev->phydev, cmd); - - return 0; -} static int dsa_slave_set_link_ksettings(struct net_device *dev, @@ -921,8 +910,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_link_ksettings = phy_ethtool_get_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; -- cgit v1.2.3 From aa62a8ca85cd05b4af4d7be70ecb735f65b0449a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:34 -0400 Subject: net: dsa: use phy_ethtool_set_link_ksettings Use phy_ethtool_set_link_ksettings now that dsa_slave_set_link_ksettings does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bb0f64f47ae7..d2b632cae468 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -421,16 +421,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) /* ethtool operations *******************************************************/ -static int -dsa_slave_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - if (!dev->phydev) - return -ENODEV; - - return phy_ethtool_ksettings_set(dev->phydev, cmd); -} - static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { @@ -910,8 +900,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .set_link_ksettings = dsa_slave_set_link_ksettings, .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; -- cgit v1.2.3 From 69b2c1629649b1e765516c7b452797b9697de9ac Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:35 -0400 Subject: net: dsa: use phy_ethtool_nway_reset Use phy_ethtool_nway_reset now that dsa_slave_nway_reset does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d2b632cae468..bf8800de13c1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -450,14 +450,6 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) ds->ops->get_regs(ds, p->dp->index, regs, _p); } -static int dsa_slave_nway_reset(struct net_device *dev) -{ - if (!dev->phydev) - return -ENODEV; - - return genphy_restart_aneg(dev->phydev); -} - static u32 dsa_slave_get_link(struct net_device *dev) { if (!dev->phydev) @@ -888,7 +880,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, - .nway_reset = dsa_slave_nway_reset, + .nway_reset = phy_ethtool_nway_reset, .get_link = dsa_slave_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, -- cgit v1.2.3 From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:12 +0200 Subject: ipmr: Add reference count to MFC entries Next commits will introduce MFC notifications through the atomic fib_notification chain, thus allowing modules to be aware of MFC entries. Due to the fact that modules may need to hold a reference to an MFC entry, add reference count to MFC entries to prevent them from being freed while these modules use them. The reference counting is done only on resolved MFC entries currently. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..86dc5f98c5dd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -652,10 +652,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -949,6 +950,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1162,7 +1164,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1264,7 +1266,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { -- cgit v1.2.3 From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them. The entries dump is done under RCU where the VIF dump uses the mrt_lock too, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 86dc5f98c5dd..49879c338357 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -3050,14 +3107,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3074,7 +3204,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } -- cgit v1.2.3 From b362053a7cc0fcc09b92642ba5dd1ca7fddc9004 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:14 +0200 Subject: ipmr: Send FIB notifications on MFC and VIF entries Use the newly introduced notification chain to send events upon VIF and MFC addition and deletion. The MFC notifications are sent only on resolved MFC entries, as unresolved cannot be offloaded. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 49879c338357..ba71bc402336 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -627,6 +627,27 @@ static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, return call_fib_notifier(nb, net, event_type, &info.info); } +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, @@ -644,6 +665,24 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, return call_fib_notifier(nb, net, event_type, &info.info); } +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -651,6 +690,7 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -660,6 +700,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -909,6 +953,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -1209,6 +1254,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1220,6 +1266,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_put(c); @@ -1248,6 +1295,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1297,6 +1346,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1304,6 +1354,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1322,6 +1373,8 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_put(c); } -- cgit v1.2.3 From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:15 +0200 Subject: net: ipmr: Add MFC offload indication Allow drivers, registered to the fib notification chain indicate whether a multicast MFC route is offloaded or not, similarly to unicast routes. The indication of whether a route is offloaded is done using the mfc_flags field on an mfc_cache struct, and the information is sent to the userspace via the RTNetlink interface only. Currently, MFC routes are either offloaded or not, thus there is no need to add per-VIF offload indication. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba71bc402336..2a795d2c0502 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; -- cgit v1.2.3 From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:16 +0200 Subject: net: mroute: Check if rule is a default rule When the ipmr starts, it adds one default FIB rule that matches all packets and sends them to the DEFAULT (multicast) FIB table. A more complex rule can be added by user to specify that for a specific interface, a packet should be look up at either an arbitrary table or according to the l3mdev of the interface. For drivers willing to offload the ipmr logic into a hardware but don't want to offload all the FIB rules functionality, provide a function that can indicate whether the FIB rule is the default multicast rule, thus only one routing table is needed. This way, a driver can register to the FIB notification chain, get notifications about FIB rules added and trigger some kind of an internal abort mechanism when a non default rule is added by the user. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2a795d2c0502..292a8e80bdfa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -274,6 +274,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -318,6 +324,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return 0; } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, -- cgit v1.2.3 From 7e7c1afb67074596f6ff96e5276d6084d1648ec1 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 28 Sep 2017 13:44:39 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 05cc7637c064..edb2f239d04d 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2017.3" +#define BATADV_SOURCE_VERSION "2017.4" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From 825ffe1f7b875127bc03faffec0ecfb05906650a Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 23 Aug 2017 21:52:13 +0200 Subject: batman-adv: Remove unnecessary parentheses checkpatch introduced with commit 63b7c73ec86b ("checkpatch: add --strict check for ifs with unnecessary parentheses") an additional test which identifies some unnecessary parentheses. Remove these unnecessary parentheses to avoid the warnings and to unify the coding style slightly more. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 24 ++++++++++++------------ net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v_elp.c | 6 +++--- net/batman-adv/bat_v_ogm.c | 12 ++++++------ net/batman-adv/distributed-arp-table.c | 4 ++-- net/batman-adv/gateway_client.c | 8 ++++---- net/batman-adv/gateway_common.c | 18 +++++++++--------- net/batman-adv/hard-interface.c | 12 ++++++------ net/batman-adv/icmp_socket.c | 4 ++-- net/batman-adv/main.c | 4 ++-- net/batman-adv/multicast.c | 2 +- net/batman-adv/originator.c | 26 +++++++++++++------------- net/batman-adv/routing.c | 6 +++--- net/batman-adv/send.c | 6 +++--- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/sysfs.c | 4 ++-- net/batman-adv/tp_meter.c | 2 +- 17 files changed, 71 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 83ba5483455a..1b659ab652fb 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -916,8 +916,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; /* the interface gets activated here to avoid race conditions between @@ -1264,7 +1264,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, * drops as they can't send and receive at the same time. */ tq_iface_penalty = BATADV_TQ_MAX_VALUE; - if (if_outgoing && (if_incoming == if_outgoing) && + if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, bat_priv); @@ -1369,7 +1369,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, ret = BATADV_NEIGH_DUP; } else { set_mark = 0; - if (is_dup && (ret != BATADV_NEIGH_DUP)) + if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } @@ -1515,7 +1515,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ - if (!is_single_hop_neigh && (!orig_neigh_router)) { + if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; @@ -1535,7 +1535,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; - if (is_bidirect && ((dup_status == BATADV_NO_DUP) || + if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, @@ -1553,8 +1553,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ - if ((ogm_packet->ttl <= 2) && - (if_incoming != if_outgoing)) { + if (ogm_packet->ttl <= 2 && + if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; @@ -1590,7 +1590,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if_incoming, if_outgoing); out_neigh: - if ((orig_neigh_node) && (!is_single_hop_neigh)) + if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) @@ -2523,9 +2523,9 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; - if ((tmp_gw_factor > max_gw_factor) || - ((tmp_gw_factor == max_gw_factor) && - (tq_avg > max_tq))) { + if (tmp_gw_factor > max_gw_factor || + (tmp_gw_factor == max_gw_factor && + tq_avg > max_tq)) { if (curr_gw) batadv_gw_node_put(curr_gw); curr_gw = gw_node; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4e2724c5b33d..93ef1c06227e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -767,7 +767,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; - if (curr_gw && (bw <= max_bw)) + if (curr_gw && bw <= max_bw) goto next; if (curr_gw) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index bd1064d98e16..1de992c58b35 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -134,7 +134,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && (throughput != SPEED_UNKNOWN)) + if (throughput && throughput != SPEED_UNKNOWN) return throughput * 10; } @@ -263,8 +263,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) goto out; /* we are in the process of shutting this interface down */ - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) goto out; /* the interface was enabled but may not be ready yet */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8be61734fc43..c251445a42a0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -304,8 +304,8 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ - if ((throughput > 10) && - (if_incoming == if_outgoing) && + if (throughput > 10 && + if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; @@ -455,7 +455,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ - if ((seq_diff < 0) && !protection_started) + if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; @@ -568,8 +568,8 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; - if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && - (router_throughput >= neigh_throughput)) + if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && + router_throughput >= neigh_throughput) goto out; } @@ -621,7 +621,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, return; /* only unknown & newer OGMs contain TVLVs we are interested in */ - if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT)) + if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, (unsigned char *)(ogm2 + 1), diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b6cfa78e9381..760c0de72582 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -492,8 +492,8 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, /* this is an hash collision with the temporary selected node. Choose * the one with the lowest address */ - if ((tmp_max == max) && max_orig_node && - (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)) + if (tmp_max == max && max_orig_node && + batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0) goto out; ret = true; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index de9955d5224d..10d521f0b17f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -248,12 +248,12 @@ void batadv_gw_election(struct batadv_priv *bat_priv) } } - if ((curr_gw) && (!next_gw)) { + if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - } else if ((!curr_gw) && (next_gw)) { + } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, @@ -411,8 +411,8 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, goto out; } - if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) && - (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))) + if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && + gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 33940c5c74a8..2c26039c23fc 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -56,8 +56,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff, if (strncasecmp(tmp_ptr, "mbit", 4) == 0) bw_unit_type = BATADV_BW_UNIT_MBIT; - if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) || - (bw_unit_type == BATADV_BW_UNIT_MBIT)) + if (strncasecmp(tmp_ptr, "kbit", 4) == 0 || + bw_unit_type == BATADV_BW_UNIT_MBIT) *tmp_ptr = '\0'; } @@ -190,7 +190,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, if (!up_new) up_new = 1; - if ((down_curr == down_new) && (up_curr == up_new)) + if (down_curr == down_new && up_curr == up_new) return count; batadv_gw_reselect(bat_priv); @@ -224,16 +224,16 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /* only fetch the tvlv value if the handler wasn't called via the * CIFNOTFND flag and if there is data to fetch */ - if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) || - (tvlv_value_len < sizeof(gateway))) { + if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND || + tvlv_value_len < sizeof(gateway)) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } else { gateway_ptr = tvlv_value; gateway.bandwidth_down = gateway_ptr->bandwidth_down; gateway.bandwidth_up = gateway_ptr->bandwidth_up; - if ((gateway.bandwidth_down == 0) || - (gateway.bandwidth_up == 0)) { + if (gateway.bandwidth_down == 0 || + gateway.bandwidth_up == 0) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } @@ -242,8 +242,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig, &gateway); /* restart gateway selection */ - if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)) + if (gateway.bandwidth_down != 0 && + atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..d4aa99c060f9 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -504,8 +504,8 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) @@ -568,8 +568,8 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) @@ -654,8 +654,8 @@ out: static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 8ead292886d1..bded31121d12 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -132,10 +132,10 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, size_t packet_len; int error; - if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0)) + if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0) return -EAGAIN; - if ((!buf) || (count < sizeof(struct batadv_icmp_packet))) + if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; if (!access_ok(VERIFY_WRITE, buf, count)) diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index fb381fb26a66..033819aefc39 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -544,8 +544,8 @@ batadv_recv_handler_register(u8 packet_type, struct batadv_hard_iface *); curr = batadv_rx_handler[packet_type]; - if ((curr != batadv_recv_unhandled_packet) && - (curr != batadv_recv_unhandled_unicast_packet)) + if (curr != batadv_recv_unhandled_packet && + curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index d327670641ac..e553a8770a89 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1126,7 +1126,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, bool orig_initialized; if (orig_mcast_enabled && tvlv_value && - (tvlv_value_len >= sizeof(mcast_flags))) + tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; spin_lock_bh(&orig->mcast_handler_lock); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8e2a4b205257..2967b86c13da 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1062,9 +1062,9 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1106,9 +1106,9 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1155,13 +1155,13 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; - if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) || - (if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) { - if ((if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) + if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) || + if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) { + if (if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n", orig_node->orig, neigh_node->addr, diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f10e3ff26f9d..40d9bf3e5bfe 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -93,14 +93,14 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, batadv_orig_ifinfo_put(orig_ifinfo); /* route deleted */ - if ((curr_router) && (!neigh_node)) { + if (curr_router && !neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Deleting route towards: %pM\n", orig_node->orig); batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Deleted route towards originator"); /* route added */ - } else if ((!curr_router) && (neigh_node)) { + } else if (!curr_router && neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Adding route towards: %pM (via %pM)\n", orig_node->orig, neigh_node->addr); @@ -381,7 +381,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, /* add record route information if not full */ if ((icmph->msg_type == BATADV_ECHO_REPLY || icmph->msg_type == BATADV_ECHO_REQUEST) && - (skb->len >= sizeof(struct batadv_icmp_packet_rr))) { + skb->len >= sizeof(struct batadv_icmp_packet_rr)) { if (skb_linearize(skb) < 0) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 054a65e6eb68..7895323fd2a7 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -142,7 +142,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, #ifdef CONFIG_BATMAN_ADV_BATMAN_V hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); - if ((hardif_neigh) && (ret != NET_XMIT_DROP)) + if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; if (hardif_neigh) @@ -615,8 +615,8 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list, * we delete only packets belonging to the given interface */ if (hard_iface && - (forw_packet->if_incoming != hard_iface) && - (forw_packet->if_outgoing != hard_iface)) + forw_packet->if_incoming != hard_iface && + forw_packet->if_outgoing != hard_iface) continue; hlist_del(&forw_packet->list); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..7c8288245f80 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -160,7 +160,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { /* check ranges */ - if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev))) + if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 0ae8b30e4eaa..aa187fd42475 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -925,8 +925,8 @@ static int batadv_store_mesh_iface_finish(struct net_device *net_dev, if (hard_iface->if_status == status_tmp) goto out; - if ((hard_iface->soft_iface) && - (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)) + if (hard_iface->soft_iface && + strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0) goto out; if (status_tmp == BATADV_IF_NOT_IN_USE) { diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bfe8effe9238..4b90033f35a8 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1206,7 +1206,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, /* send the ack */ r = batadv_send_skb_to_orig(skb, orig_node, NULL); - if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { + if (unlikely(r < 0) || r == NET_XMIT_DROP) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } -- cgit v1.2.3 From 48915aed60c9615e5eaa0f1683640635dade788e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 28 Sep 2017 17:16:51 +0200 Subject: batman-adv: Fix "line over 80 characters" checkpatch warning Fixes: 242c1a28eb61 ("net: Remove useless function skb_header_release") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7c8288245f80..3af4b0b29b23 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,8 +69,8 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using __skb_header_release in our skbs to allow skb_cow_header to - * work optimally. This means that those skbs are not allowed to read + * data using __skb_header_release in our skbs to allow skb_cow_header + * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. -- cgit v1.2.3 From 8f1975e31d8ed0c021a1993a4d9123dd39c740ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 09:14:14 -0700 Subject: inetpeer: speed up inetpeer_invalidate_tree() As measured in my prior patch ("sch_netem: faster rb tree removal"), rbtree_postorder_for_each_entry_safe() is nice looking but much slower than using rb_next() directly, except when tree is small enough to fit in CPU caches (then the cost is the same) From: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..6e5626cc366c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); -- cgit v1.2.3 From 76cf546c2802f6e25113ba481d7e85d0298768c6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:49 -0700 Subject: net_sched: use idr to allocate bpf filter handles Instead of calling cls_bpf_get() in a loop to find a unused handle, just switch to idr API to allocate new handles. Cc: Daniel Borkmann Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 57 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 36671b0fb125..6c6b21f6ba62 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; - u32 hgen; + struct idr handle_idr; struct rcu_head rcu; }; @@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; @@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + + idr_remove_ext(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, - struct cls_bpf_head *head) -{ - unsigned int i = 0x80000000; - u32 handle; - - do { - if (++head->hgen == 0x7FFFFFFF) - head->hgen = 1; - } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -451,6 +436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; + unsigned long idr_index; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, } } - if (handle == 0) - prog->handle = cls_bpf_grab_new_handle(tp, head); - else + if (handle == 0) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (ret) + goto errout; + prog->handle = idr_index; + } else { + if (!oldprog) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (ret) + goto errout; + } prog->handle = handle; - if (prog->handle == 0) { - ret = -EINVAL; - goto errout; } ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) - goto errout; + goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); __cls_bpf_delete_prog(prog); return ret; } @@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (oldprog) { + idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); @@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_idr: + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); -- cgit v1.2.3 From 1d8134fea2eb460698a281f91c03c400f7e546ce Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:50 -0700 Subject: net_sched: use idr to allocate basic filter handles Instead of calling basic_get() in a loop to find a unused handle, just switch to idr API to allocate new handles. Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d89ebafd2239..cfeb6f158566 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,13 +17,14 @@ #include #include #include +#include #include #include #include struct basic_head { - u32 hgenerator; struct list_head flist; + struct idr handle_idr; struct rcu_head rcu; }; @@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } @@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); } + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); return 0; @@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; + unsigned long idr_index; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -179,30 +185,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, err = -EINVAL; if (handle) { fnew->handle = handle; - } else if (fold) { - fnew->handle = fold->handle; + if (!fold) { + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; + } } else { - unsigned int i = 0x80000000; - do { - if (++head->hgenerator == 0x7FFFFFFF) - head->hgenerator = 1; - } while (--i > 0 && basic_get(tp, head->hgenerator)); - - if (i <= 0) { - pr_err("Insufficient number of handles\n"); + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (err) goto errout; - } - - fnew->handle = head->hgenerator; + fnew->handle = idr_index; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); - if (err < 0) + if (err < 0) { + if (!fold) + idr_remove_ext(&head->handle_idr, fnew->handle); goto errout; + } *arg = fnew; if (fold) { + idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, basic_delete_filter); -- cgit v1.2.3 From e7614370d6f04711c4e4b48f7055e5008fa4ed42 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:51 -0700 Subject: net_sched: use idr to allocate u32 filter handles Instead of calling u32_lookup_ht() in a loop to find a unused handle, just switch to idr API to allocate new handles. u32 filters are special as the handle could contain a hash table id and a key id, so we need two IDR to allocate each of them. Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 108 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 10b8d851fc6b..094d224411a9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -46,6 +46,7 @@ #include #include #include +#include struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -82,6 +83,7 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; + struct idr handle_idr; struct rcu_head rcu; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. @@ -93,7 +95,7 @@ struct tc_u_common { struct tc_u_hnode __rcu *hlist; struct Qdisc *q; int refcnt; - u32 hgenerator; + struct idr handle_idr; struct hlist_node hnode; struct rcu_head rcu; }; @@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle) return u32_lookup_key(ht, handle); } -static u32 gen_new_htid(struct tc_u_common *tp_c) +static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { - int i = 0x800; + unsigned long idr_index; + int err; - /* hgenerator only used inside rtnl lock it is safe to increment + /* This is only used inside rtnl lock it is safe to increment * without read _copy_ update semantics */ - do { - if (++tp_c->hgenerator == 0x7FF) - tp_c->hgenerator = 1; - } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); - - return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; + err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index, + 1, 0x7FF, GFP_KERNEL); + if (err) + return 0; + return (u32)(idr_index | 0x800) << 20; } static struct hlist_head *tc_u_common_hash; @@ -366,8 +368,9 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; root_ht->refcnt++; - root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); @@ -377,6 +380,7 @@ static int u32_init(struct tcf_proto *tp) } tp_c->q = tp->q; INIT_HLIST_NODE(&tp_c->hnode); + idr_init(&tp_c->handle_idr); h = tc_u_hash(tp); hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); @@ -565,6 +569,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); + idr_remove_ext(&ht->handle_idr, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -586,6 +591,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht); + idr_destroy(&ht->handle_idr); + idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -633,6 +640,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree_rcu(ht, rcu); } + idr_destroy(&tp_c->handle_idr); kfree(tp_c); } @@ -701,27 +709,21 @@ ret: return ret; } -#define NR_U32_NODE (1<<12) -static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { - struct tc_u_knode *n; - unsigned long i; - unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), - GFP_KERNEL); - if (!bitmap) - return handle | 0xFFF; - - for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); - n; - n = rtnl_dereference(n->next)) - set_bit(TC_U32_NODE(n->handle), bitmap); - - i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); - if (i >= NR_U32_NODE) - i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); + unsigned long idr_index; + u32 start = htid | 0x800; + u32 max = htid | 0xFFF; + u32 min = htid; + + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + start, max + 1, GFP_KERNEL)) { + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + min + 1, max + 1, GFP_KERNEL)) + return max; + } - kfree(bitmap); - return handle | (i >= NR_U32_NODE ? 0xFFF : i); + return (u32)idr_index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -806,6 +808,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, if (pins->handle == n->handle) break; + idr_replace_ext(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } @@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; if (TC_U32_KEY(handle)) return -EINVAL; - if (handle == 0) { - handle = gen_new_htid(tp->data); - if (handle == 0) - return -ENOMEM; - } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; + if (handle == 0) { + handle = gen_new_htid(tp->data, ht); + if (handle == 0) { + kfree(ht); + return -ENOMEM; + } + } else { + err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL, + handle, handle + 1, GFP_KERNEL); + if (err) { + kfree(ht); + return err; + } + } ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + idr_init(&ht->handle_idr); err = u32_replace_hw_hnode(tp, ht, flags); if (err) { + idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); return err; } @@ -986,24 +1000,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) return -EINVAL; handle = htid | TC_U32_NODE(handle); + err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, + handle, handle + 1, + GFP_KERNEL); + if (err) + return err; } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL] == NULL) - return -EINVAL; + if (tb[TCA_U32_SEL] == NULL) { + err = -EINVAL; + goto erridr; + } s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); - if (n == NULL) - return -ENOBUFS; + if (n == NULL) { + err = -ENOBUFS; + goto erridr; + } #ifdef CONFIG_CLS_U32_PERF size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); if (!n->pf) { - kfree(n); - return -ENOBUFS; + err = -ENOBUFS; + goto errfree; } #endif @@ -1066,9 +1089,12 @@ errhw: errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF +errfree: free_percpu(n->pf); #endif kfree(n); +erridr: + idr_remove_ext(&ht->handle_idr, handle); return err; } -- cgit v1.2.3 From 79110a0426d8179a51bf3cb698a84f6ec98ca60c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:40 +0200 Subject: rtnetlink: add helper to put master and link ifindexes rtnl_fill_ifinfo currently requires caller to hold the rtnl mutex. Unfortunately the function is quite large which makes it harder to see which spots require the lock, which spots assume it and which ones could do without. Add helpers to factor out the ifindex dumping, one can use rcu to avoid rtnl dependency. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..c801212ee40e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1307,6 +1307,31 @@ static u32 rtnl_get_event(unsigned long event) return rtnl_event_type; } +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) +{ + int ifindex = dev_get_iflink(dev); + + if (dev->ifindex == ifindex) + return 0; + + return nla_put_u32(skb, IFLA_LINK, ifindex); +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, @@ -1316,7 +1341,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *af_spec; struct rtnl_af_ops *af_ops; - struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1345,10 +1369,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || - (upper_dev && - nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_iflink(skb, dev) || + put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || -- cgit v1.2.3 From 250fc3dfdbd3e8b5c751ce7b5a26176ec62ca0f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:41 +0200 Subject: rtnetlink: add helpers to dump vf information similar to earlier patches, split out more parts of this function to better see what is happening and where we assume rtnl is locked. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c801212ee40e..d504e78cd01f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1211,6 +1211,36 @@ nla_put_vfinfo_failure: return -EMSGSIZE; } +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + return -EMSGSIZE; + } + + nla_nest_end(skb, vfinfo); + return 0; +} + static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; @@ -1407,27 +1437,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && - nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && - ext_filter_mask & RTEXT_FILTER_VF) { - int i; - struct nlattr *vfinfo; - int num_vfs = dev_num_vf(dev->dev.parent); - - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); - if (!vfinfo) - goto nla_put_failure; - for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) - goto nla_put_failure; - } - - nla_nest_end(skb, vfinfo); - } - if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; -- cgit v1.2.3 From b1e66b9a67d67d0e73091b04b51e524581c8c887 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:42 +0200 Subject: rtnetlink: add helpers to dump netnsid information Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d504e78cd01f..d524609c587c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1362,6 +1362,23 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev) +{ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(dev_net(dev), link_net); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + } + } + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, @@ -1451,17 +1468,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (dev->rtnl_link_ops && - dev->rtnl_link_ops->get_link_net) { - struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); - - if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); - - if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) - goto nla_put_failure; - } - } + if (rtnl_fill_link_netnsid(skb, dev)) + goto nla_put_failure; if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) goto nla_put_failure; -- cgit v1.2.3 From 4c82a95e523721637d44e0e8d879fa11fa825eec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:43 +0200 Subject: rtnetlink: rtnl_have_link_slave_info doesn't need rtnl it can be switched to rcu. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d524609c587c..e6955da0d58d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -522,11 +522,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; + bool ret = false; - master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) - return true; - return false; + ret = true; + rcu_read_unlock(); + return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, -- cgit v1.2.3 From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 27 Sep 2017 16:12:44 +0300 Subject: net: bridge: add per-port group_fwd_mask with less restrictions We need to be able to transparently forward most link-local frames via tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a mask which restricts the forwarding of STP and LACP, but we need to be able to forward these over tunnels and control that forwarding on a per-port basis thus add a new per-port group_fwd_mask option which only disallows mac pause frames to be forwarded (they're always dropped anyway). The patch does not change the current default situation - all of the others are still restricted unless configured for forwarding. We have successfully tested this patch with LACP and STP forwarding over VxLAN and qinq tunnels. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_input.c | 1 + net/bridge/br_netlink.c | 14 +++++++++++++- net/bridge/br_private.h | 10 +++++++++- net/bridge/br_sysfs_if.c | 18 ++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..7cb613776b31 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) * * Others reserved for future standardization */ + fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..dea88a255d26 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + 0; } @@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; /* Change the state of the port and notify spanning tree */ @@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = fwd_mask; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e870cfc85b14..020c709a017f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -36,7 +36,14 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ -#define BR_GROUPFWD_RESTRICTED 0x0007u +enum { + BR_GROUPFWD_STP = BIT(0), + BR_GROUPFWD_MACPAUSE = BIT(1), + BR_GROUPFWD_LACP = BIT(2), +}; + +#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ + BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -268,6 +275,7 @@ struct net_bridge_port { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + u16 group_fwd_mask; }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 5d5d413a6cf8..9110d5e56085 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v) } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%#x\n", p->group_fwd_mask); +} + +static int store_group_fwd_mask(struct net_bridge_port *p, + unsigned long v) +{ + if (v & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = v; + + return 0; +} +static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, + store_group_fwd_mask); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, + &brport_attr_group_fwd_mask, NULL }; -- cgit v1.2.3 From cf5d74b85ef40c202c76d90959db4d850f301b95 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Wed, 27 Sep 2017 18:30:58 +0200 Subject: tcp: fix under-evaluated ssthresh in TCP Vegas With the commit 76174004a0f19785 (tcp: do not slow start when cwnd equals ssthresh), the comparison to the reduced cwnd in tcp_vegas_ssthresh() would under-evaluate the ssthresh. Signed-off-by: Hoang Tran Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) -- cgit v1.2.3 From c7c3e5913bf18eda3cf38932bebdce48351baac9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 19:08:00 -0700 Subject: net: ipv4: remove fib_weight fib_weight in fib_info is set but not used. Remove it and the helpers for setting it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..be0874620ecc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); -- cgit v1.2.3 From fa8fefaa678ea390b873195d19c09930da84a4bb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 20:41:59 -0700 Subject: net: ipv4: remove fib_info arg to fib_check_nh fib_check_nh does not use the fib_info arg; remove t. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index be0874620ecc..467b3c15395f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -766,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1250,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) -- cgit v1.2.3 From 1f372c7bfb23286d2bf4ce0423ab488e86b74bb2 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 25 Sep 2017 22:01:36 +0100 Subject: net: ipv6: send NS for DAD when link operationally up The NS for DAD are sent on admin up as long as a valid qdisc is found. A race condition exists by which these packets will not egress the interface if the operational state of the lower device is not yet up. The solution is to delay DAD until the link is operationally up according to RFC2863. Rather than only doing this, follow the existing code checks by deferring IPv6 device initialization altogether. The fix allows DAD on devices like tunnels that are controlled by userspace control plane. The fix has no impact on regular deployments, but means that there is no IPv6 connectivity until the port has been opened in the case of port-based network access control, which should be desirable. Signed-off-by: Mike Manning Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 13c3b697f8c0..f553f72d0bee 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -3403,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3418,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. */ break; } -- cgit v1.2.3 From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; -- cgit v1.2.3 From 706cc9f51d9a22528af18d4b3ffbd17b30a1d3b0 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 30 Sep 2017 09:24:28 +0200 Subject: batman-adv: Add argument names for function ptr definitions checkpatch started to report unnamed arguments in function pointer definitions. Add the corresponding names to these definitions to avoid this warning. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 033819aefc39..4daed7ad46f2 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -73,8 +73,8 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; -static int (*batadv_rx_handler[256])(struct sk_buff *, - struct batadv_hard_iface *); +static int (*batadv_rx_handler[256])(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -540,8 +540,8 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { - int (*curr)(struct sk_buff *, - struct batadv_hard_iface *); + int (*curr)(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; if (curr != batadv_recv_unhandled_packet && -- cgit v1.2.3 From ef739d8aabaa26dd99f5661e38a86f4d85d8dfed Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 14:34:22 +0100 Subject: net: ipmr: make function ipmr_notifier_init static The function ipmr_notifier_init is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: warning: symbol 'ipmr_notifier_init' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 292a8e80bdfa..a844738b38bd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3224,7 +3224,7 @@ static const struct fib_notifier_ops ipmr_notifier_ops_template = { .owner = THIS_MODULE, }; -int __net_init ipmr_notifier_init(struct net *net) +static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; -- cgit v1.2.3 From b1c49d14200dae47544f7ea6ee9040ded01f8425 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 15:01:16 +0100 Subject: net_sched: remove redundant assignment to ret The assignment of -EINVAL to variable ret is redundant as it is being overwritten on the following error exit paths or to the return value from the following call to basic_set_parms. Fix this up by removing it. Cleans up clang warning message: net/sched/cls_basic.c:185:2: warning: Value stored to 'err' is never read Fixes: 1d8134fea2eb ("net_sched: use idr to allocate basic filter handles") Signed-off-by: Colin Ian King Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index cfeb6f158566..700b345b07f9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -182,7 +182,6 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; if (handle) { fnew->handle = handle; if (!fold) { -- cgit v1.2.3 From 3775b1b7f0c330e59c434d1852d7762ae0a9c164 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:15 -0400 Subject: net: dsa: add master helper to look up slaves The DSA tagging code does not need to know about the DSA architecture, it only needs to return the slave device corresponding to the source port index (and eventually the source device index for cascade-capable switches) parsed from the frame received on the master device. For this purpose, provide an inline dsa_master_get_slave helper which validates the device and port indexes and look up the slave device. This makes the tagging rcv functions more concise and robust, and also makes dsa_get_cpu_port obsolete. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 24 +++++++++++++++++++----- net/dsa/tag_brcm.c | 9 ++------- net/dsa/tag_dsa.c | 18 ++---------------- net/dsa/tag_edsa.c | 18 ++---------------- net/dsa/tag_ksz.c | 9 +++------ net/dsa/tag_lan9303.c | 20 ++------------------ net/dsa/tag_mtk.c | 16 +++------------- net/dsa/tag_qca.c | 17 +++-------------- net/dsa/tag_trailer.c | 9 +++------ 9 files changed, 39 insertions(+), 101 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index eccc62776283..d429505dc4e7 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -113,6 +113,25 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int dsa_master_ethtool_setup(struct net_device *dev); void dsa_master_ethtool_restore(struct net_device *dev); +static inline struct net_device *dsa_master_get_slave(struct net_device *dev, + int device, int port) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + + if (device < 0 || device >= DSA_MAX_SWITCHES) + return NULL; + + ds = dst->ds[device]; + if (!ds) + return NULL; + + if (port < 0 || port >= ds->num_ports) + return NULL; + + return ds->ports[port].netdev; +} + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -182,9 +201,4 @@ static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p) return p->dp->cpu_dp->netdev; } -static inline struct dsa_port *dsa_get_cpu_port(struct dsa_switch_tree *dst) -{ - return dst->cpu_dp; -} - #endif diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index dbb016434ace..8e4bdb9d9ae3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -92,9 +92,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; int source_port; u8 *brcm_tag; @@ -117,8 +114,8 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; - /* Validate port against switch setup, either the port is totally */ - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; /* Remove Broadcom tag and update checksum */ @@ -129,8 +126,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fbf9ca954773..c77218f173d1 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -67,8 +67,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *dsa_header; int source_device; int source_port; @@ -93,18 +91,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -153,8 +141,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 76367ba1b2e2..0b83cbe0c9e8 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -80,8 +80,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *edsa_header; int source_device; int source_port; @@ -106,18 +104,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -172,8 +160,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 010ca0a336c4..b241c990cde0 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -80,22 +80,19 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *tag; int source_port; tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; source_port = tag[0] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 0b9826105e42..4f211e56c81f 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -71,17 +71,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u16 *lan9303_tag; - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; unsigned int source_port; - ds = dst->ds[0]; - - if (unlikely(!ds)) { - dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n"); - return NULL; - } - if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull\n"); @@ -103,16 +94,12 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, source_port = ntohs(lan9303_tag[1]) & 0x3; - if (source_port >= ds->num_ports) { + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); return NULL; } - if (!ds->ports[source_port].netdev) { - dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); - return NULL; - } - /* remove the special VLAN tag between the MAC addresses * and the current ethertype field. */ @@ -120,9 +107,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - /* forward the packet to the dedicated interface */ - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ec8ee5f43255..968586c5d40e 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -46,8 +46,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; int port; __be16 *phdr, hdr; @@ -68,20 +66,12 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - MTK_HDR_LEN, 2 * ETH_ALEN); - /* This protocol doesn't support cascading multiple - * switches so it's safe to assume the switch is first - * in the tree. - */ - ds = dst->ds[0]; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1d4c70711c0f..8d33d9ebf910 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -65,9 +65,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds; u8 ver; int port; __be16 *phdr, hdr; @@ -92,20 +89,12 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, ETH_HLEN - QCA_HDR_LEN); - /* This protocol doesn't support cascading multiple switches so it's - * safe to assume the switch is first in the tree - */ - ds = cpu_dp->ds; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - /* Update skb & forward the frame accordingly */ - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d2fd4923aa3e..61668be267f5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -58,9 +58,6 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *trailer; int source_port; @@ -73,13 +70,13 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; source_port = trailer[1] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port].netdev; - return skb; } -- cgit v1.2.3 From 7ec764eef934409c4e15539440c31bca0b8de005 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:16 -0400 Subject: net: dsa: use cpu_dp in master code Make it clear that the master device is linked to a CPU port by using "cpu_dp" for the dsa_port variable in master.c instead of "port", then use a "port" variable to describe the port index, as usually seen in other places of DSA core. This will make the future patch touching dsa_ptr more readable. There is no functional changes. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/master.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/master.c b/net/dsa/master.c index 5e5147ec5a44..ef15d35f1574 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -17,9 +17,10 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int count = 0; if (ops && ops->get_sset_count && ops->get_ethtool_stats) { @@ -28,15 +29,15 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, } if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, port->index, data + count); + ds->ops->get_ethtool_stats(ds, port, data + count); } static int dsa_master_get_sset_count(struct net_device *dev, int sset) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; int count = 0; if (ops && ops->get_sset_count) @@ -52,16 +53,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; unsigned int i; uint8_t pfx[4]; uint8_t *ndata; - snprintf(pfx, sizeof(pfx), "p%.2d", port->index); + snprintf(pfx, sizeof(pfx), "p%.2d", port); /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; @@ -76,7 +78,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->ops->get_strings(ds, port->index, ndata); + ds->ops->get_strings(ds, port, ndata); count = ds->ops->get_sset_count(ds); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), @@ -89,17 +91,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; + struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; - port->orig_ethtool_ops = dev->ethtool_ops; - if (port->orig_ethtool_ops) - memcpy(ops, port->orig_ethtool_ops, sizeof(*ops)); + cpu_dp->orig_ethtool_ops = dev->ethtool_ops; + if (cpu_dp->orig_ethtool_ops) + memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops)); ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; @@ -113,8 +115,8 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; + struct dsa_port *cpu_dp = dst->cpu_dp; - dev->ethtool_ops = port->orig_ethtool_ops; - port->orig_ethtool_ops = NULL; + dev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } -- cgit v1.2.3 From 62fc95876298987c35e9fb10badd467f4787aae7 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:17 -0400 Subject: net: dsa: use temporary dsa_device_ops variable When resolving the DSA tagging protocol used by a CPU switch, use a temporary "tag_ops" variable to store the dsa_device_ops instead of using directly dst->tag_ops. This will make the future patches moving this pointer around easier to read. There is no functional changes. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 8 +++++--- net/dsa/legacy.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcccaebde708..6a10c5c1639f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -485,6 +485,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -514,13 +515,14 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, ds->cpu_port_mask |= BIT(index); tag_protocol = ds->ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); ds->cpu_port_mask &= ~BIT(index); - return PTR_ERR(dst->tag_ops); + return PTR_ERR(tag_ops); } + dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ae505d8e4417..8e849013f69d 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -144,13 +144,15 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, * switch. */ if (dst->cpu_dp->ds == ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) - return PTR_ERR(dst->tag_ops); + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) + return PTR_ERR(tag_ops); + dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; } -- cgit v1.2.3 From 152402483ed75b167d5628d414e876ffa7a6d4c4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:18 -0400 Subject: net: dsa: add tagging ops to port The DSA tagging protocol operations are specific to each CPU port, thus the dsa_device_ops pointer belongs to the dsa_port structure. >From now on assign a slave's xmit copy from its CPU port tagging operations. This will ease the future support for multiple CPU ports. Also keep the tag_ops at the beginning of the dsa_port structure so that we ensure copies for hot path are in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 2 +- net/dsa/legacy.c | 1 + net/dsa/slave.c | 3 +-- 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 6a10c5c1639f..9eac4726dc0c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -522,6 +522,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, return PTR_ERR(tag_ops); } + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d429505dc4e7..9397291bb3aa 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info { }; struct dsa_slave_priv { - /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ + /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 8e849013f69d..4d374541815a 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -152,6 +152,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bf8800de13c1..4b634db05cee 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,7 +1117,6 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_switch *ds = port->ds; - struct dsa_switch_tree *dst = ds->dst; struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1162,7 +1161,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = dst->tag_ops->xmit; + p->xmit = cpu_dp->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; -- cgit v1.2.3 From 3e41f93b358a8800194b87995ad076fc50919719 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:19 -0400 Subject: net: dsa: prepare master receive hot path In preparation to make DSA master devices point to their corresponding CPU port instead of the whole tree, add copies of dst and rcv in the dsa_port structure so that we keep fast access in the receive hot path. Also keep the copies at the beginning of the dsa_port structure in order to ensure they are available in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 4 ++++ net/dsa/legacy.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9eac4726dc0c..b71e3bb478e4 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -524,7 +524,11 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Make a few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 4d374541815a..96c7e3f8b8bb 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -154,7 +154,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); -- cgit v1.2.3 From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:20 -0400 Subject: net: dsa: change dsa_ptr for a dsa_port With DSA, a master net device (CPU facing interface) has a dsa_ptr pointer to which hangs a dsa_switch_tree. This is not correct because a master interface is wired to a dedicated switch port, and because we can theoretically have several master interfaces pointing to several CPU ports of the same switch fabric. Change the master interface's dsa_ptr for the CPU dsa_port pointer. This is a step towards supporting multiple CPU ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 3 ++- net/dsa/legacy.c | 2 +- net/dsa/master.c | 15 +++++---------- 5 files changed, 12 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 81c852e32821..51ca2a524a27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -160,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -174,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e3bb478e4..62302558f38c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -438,7 +438,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. */ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9397291bb3aa..2850077cc9cc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -116,7 +116,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); static inline struct net_device *dsa_master_get_slave(struct net_device *dev, int device, int port) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_switch *ds; if (device < 0 || device >= DSA_MAX_SWITCHES) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 96c7e3f8b8bb..71917505a5cc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -607,7 +607,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. */ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } diff --git a/net/dsa/master.c b/net/dsa/master.c index ef15d35f1574..5f3f57e372e0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -16,8 +16,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -34,8 +33,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, static int dsa_master_get_sset_count(struct net_device *dev, int sset) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int count = 0; @@ -52,8 +50,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -90,8 +87,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; @@ -114,8 +110,7 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; -- cgit v1.2.3 From aa193d9b1d7ea6893ce24a9d141f676950563987 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:21 -0400 Subject: net: dsa: remove tag ops from the switch tree Now that the dsa_ptr is a dsa_port instance, there is no need to keep the tag operations in the dsa_switch_tree structure. Remove it. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 -- net/dsa/legacy.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 62302558f38c..54ed054777bd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -523,11 +523,9 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, } dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Make a few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 71917505a5cc..19ff6e0a21dc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -153,11 +153,9 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return PTR_ERR(tag_ops); dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; } -- cgit v1.2.3 From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: ipv4: Namespaceify tcp_fastopen knob Different namespace application might require enable TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs be per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 ++++--- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_fastopen.c | 11 +++++------ net/ipv4/tcp_ipv4.c | 2 ++ 5 files changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ddd126d120ac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,8 +217,9 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..e31e853cf486 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_fastopen_key", .mode = 0600, @@ -1085,6 +1078,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..dac56c4ad357 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1126,7 +1126,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -2759,7 +2759,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..31b08ec38cb8 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,8 +9,6 @@ #include #include -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); @@ -279,21 +277,22 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -347,7 +346,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..88409b13c9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2472,6 +2472,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + return 0; fail: tcp_sk_exit(net); -- cgit v1.2.3 From dd000598a39b6937fcefdf143720ec9fb5250e72 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because in tcp_fastopen_cookie_gen,it wouldn't call tcp_fastopen_init_key_once. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 5 ----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ddd126d120ac..e73ce79d7176 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e31e853cf486..f6324ead0e19 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -282,11 +282,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dac56c4ad357..4e395452d69f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 31b08ec38cb8..8c8f0f0af59d 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -13,7 +13,7 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(void) { static u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -23,7 +23,7 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) + if (net_get_random_once(key, sizeof(key))) tcp_fastopen_reset_cipher(key, sizeof(key)); } -- cgit v1.2.3 From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: ipv4: Namespaceify tcp_fastopen_key knob Different namespace application might require different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 21 ++++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 64 +++++++++++++++++++++++++++++++--------------- net/ipv4/tcp_ipv4.c | 6 +++++ 5 files changed, 63 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e73ce79d7176..43a1bbed7a42 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f6324ead0e19..20e19fe78dbd 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,7 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -395,12 +397,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &sysctl_tcp_fastopen_blackhole_timeout, @@ -1080,6 +1076,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e395452d69f..23225c98d287 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8c8f0f0af59d..4eae44ac3cb0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,13 +9,18 @@ #include #include -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(void) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -23,8 +28,8 @@ void tcp_fastopen_init_key_once(void) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key))) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -35,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -59,26 +79,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -94,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -102,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -110,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -296,7 +318,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 88409b13c9d2..49c74c0d0d21 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2473,6 +2473,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); return 0; fail: @@ -2483,7 +2484,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { -- cgit v1.2.3 From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++--------- net/ipv4/tcp_fastopen.c | 30 +++++++++++------------------- net/ipv4/tcp_ipv4.c | 2 ++ 3 files changed, 24 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 20e19fe78dbd..cac8dd309f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -355,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -397,14 +399,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -1083,6 +1077,14 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4eae44ac3cb0..de470e7e586f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -422,25 +422,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. */ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -449,17 +440,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -495,10 +487,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49c74c0d0d21..ad3b5bbaf942 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2474,6 +2474,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: -- cgit v1.2.3 From 504871e602d9a9ea2321d47ca506887417f54e75 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Tue, 26 Sep 2017 00:39:15 +0530 Subject: mac80211: fix bandwidth computation for TDLS peers Section 11.23.1 of 80211-2016 specification allows TDLS peers to operate on wider bandwidths though they are connected to a BSS which do not support wider bandwidth operations, provided both the peers advertise wider bandwidth capabilities. The existing logic considers the minimum of station's and AP's capability for bandwidth computation. The same logic applies for TDLS peers as well, this restricts operating on wider bandwidths over a TDLS link when the peers are connected to legacy APs. As an example, if 80Mhz VHT capable peers are connected to a 20Mhz 5 GHz AP, then as per the existing logic TDLS operation will be restricted to 20Mhz. Address this problem by not considering BSS capability in bandwidth computation if the participating TDLS peers have wider bandwidth capability. Signed-off-by: Manikanta Pubbisetty [lots of wording/typo fixes] Signed-off-by: Johannes Berg --- net/mac80211/vht.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 19ec2189d3ac..b9276ac849fa 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -386,6 +386,16 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); + + /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of + * IEEE80211-2016 specification makes higher bandwidth operation + * possible on the TDLS link if the peers have wider bandwidth + * capability. + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + return bw; + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; -- cgit v1.2.3 From 8f797c288e3a788e9578dd4db08b624a6d4b6a9b Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 26 Sep 2017 13:48:05 +0200 Subject: mac80211: fix STA_SLOW_THRESHOLD htmldocs failure Patch fixes htmldocs build problem: Error(.//net/mac80211/sta_info.h:416): cannot understand prototype: 'STA_SLOW_THRESHOLD 6000 ' Signed-off-by: Stanislaw Gruszka Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 3acbdfa9f649..a35c964f6217 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -398,7 +398,7 @@ struct ieee80211_sta_rx_stats { u64 msdu[IEEE80211_NUM_TIDS + 1]; }; -/** +/* * The bandwidth threshold below which the per-station CoDel parameters will be * scaled to be more lenient (to prevent starvation of slow stations). This * value will be scaled by the number of active stations when it is being -- cgit v1.2.3 From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:49 +0200 Subject: cfg80211/nl80211: add a port authorized event Add an event that indicates that a connection is authorized (i.e. the 4 way handshake was performed by the driver). This event should be sent by the driver after sending a connect/roamed event. This is useful for networks that require 802.1X authentication. In cases that the driver supports 4 way handshake offload, but the 802.1X authentication is managed by user space, the driver needs to inform user space right after the 802.11 association was completed so user space can initialize its 802.1X state machine etc. However, it is also possible that the AP will choose to skip the 802.1X authentication (e.g. when PMKSA caching is used) and proceed with the 4 way handshake immediately. In this case the driver needs to inform user space that 802.1X authentication is no longer required (e.g. to prevent user space from disconnecting since it did not get any EAPOLs from the AP). This is also useful for roaming, in which case it is possible that the driver used the Fast Transition protocol so 802.1X is not required. Since there will now be a dedicated notification indicating that the connection is authorized, the authorized flag can be removed from the roamed event. Drivers can send the new port authorized event right after sending the roamed event to indicate the new AP is already authorized. This therefore reserves the old PORT_AUTHORIZED attribute. Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- net/wireless/core.h | 5 +++++ net/wireless/nl80211.c | 34 +++++++++++++++++++++++++++++++--- net/wireless/nl80211.h | 2 ++ net/wireless/sme.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- net/wireless/util.c | 3 +++ 5 files changed, 85 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 6e809325af3b..35165f42c2a8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -216,6 +216,7 @@ enum cfg80211_event_type { EVENT_DISCONNECTED, EVENT_IBSS_JOINED, EVENT_STOPPED, + EVENT_PORT_AUTHORIZED, }; struct cfg80211_event { @@ -235,6 +236,9 @@ struct cfg80211_event { u8 bssid[ETH_ALEN]; struct ieee80211_channel *channel; } ij; + struct { + u8 bssid[ETH_ALEN]; + } pa; }; }; @@ -385,6 +389,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1e39ba3cfd06..90e212db6889 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13830,9 +13830,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, info->req_ie)) || (info->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, - info->resp_ie)) || - (info->authorized && - nla_put_flag(msg, NL80211_ATTR_PORT_AUTHORIZED))) + info->resp_ie))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13846,6 +13844,36 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PORT_AUTHORIZED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b96933322077..bf9e772a30b9 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -58,6 +58,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 0a49b88070d0..f38ed490e42b 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -960,7 +960,6 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); ev->rm.bss = info->bss; - ev->rm.authorized = info->authorized; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -969,6 +968,50 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, } EXPORT_SYMBOL(cfg80211_roamed); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +{ + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(!wdev->current_bss) || + WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + return; + + nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, + bssid); +} + +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + if (WARN_ON(!bssid)) + return; + + ev = kzalloc(sizeof(*ev), gfp); + if (!ev) + return; + + ev->type = EVENT_PORT_AUTHORIZED; + memcpy(ev->pa.bssid, bssid, ETH_ALEN); + + /* + * Use the wdev event list so that if there are pending + * connected/roamed events, they will be reported first. + */ + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_port_authorized); + void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { diff --git a/net/wireless/util.c b/net/wireless/util.c index 7a1fcc6ee060..ff21c314a609 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -846,6 +846,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_STOPPED: __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; + case EVENT_PORT_AUTHORIZED: + __cfg80211_port_authorized(wdev, ev->pa.bssid); + break; } wdev_unlock(wdev); -- cgit v1.2.3 From a38402bc50709aac76796a955a15152a76e3fd4e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:16 +0200 Subject: flow_dissector: dissect tunnel info Move dissection of tunnel info from the flower classifier to the flow dissector where all other dissection occurs. This should not have any behavioural affect on other users of the flow dissector. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_flower.c | 25 ------------ 2 files changed, 100 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0a977373d003..1f5caafb4492 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} + +static void +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. */ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } +} + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + __skb_flow_dissect_tunnel_info(skb, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d230cb4c8094..db831ac708f6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -152,37 +152,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); - info = skb_tunnel_info(skb); - if (info) { - struct ip_tunnel_key *key = &info->key; - - switch (ip_tunnel_info_af(info)) { - case AF_INET: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV4_ADDRS; - skb_key.enc_ipv4.src = key->u.ipv4.src; - skb_key.enc_ipv4.dst = key->u.ipv4.dst; - break; - case AF_INET6: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV6_ADDRS; - skb_key.enc_ipv6.src = key->u.ipv6.src; - skb_key.enc_ipv6.dst = key->u.ipv6.dst; - break; - } - - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); - skb_key.enc_tp.src = key->tp_src; - skb_key.enc_tp.dst = key->tp_dst; - } - skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. -- cgit v1.2.3 From 5d8b3e69fc5e5ccafc9db1251bb7c78a8622fddd Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:07 +0200 Subject: ipv4: ipmr: Add the parent ID field to VIF struct In order to allow the ipmr module to do partial multicast forwarding according to the device parent ID, add the device parent ID field to the VIF struct. This way, the forwarding path can use the parent ID field without invoking switchdev calls, which requires the RTNL lock. When a new VIF is added, set the device parent ID field in it by invoking the switchdev_port_attr_get call. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a844738b38bd..1b161ada7ae6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include #include #include +#include struct ipmr_rule { struct fib_rule common; @@ -868,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -942,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; -- cgit v1.2.3 From a5bc9294d70fe85729bb343eef281ccbe78ff119 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:08 +0200 Subject: ipv4: ipmr: Don't forward packets already forwarded by hardware Change the ipmr module to not forward packets if: - The packet is marked with the offload_mr_fwd_mark, and - Both input interface and output interface share the same parent ID. This way, a packet can go through partial multicast forwarding in the hardware, where it will be forwarded only to the devices that share the same parent ID (AKA, reside inside the same hardware). The kernel will forward the packet to all other interfaces. To do this, add the ipmr_offload_forward helper, which per skb, ingress VIF and egress VIF, returns whether the forwarding was offloaded to hardware. The ipmr_queue_xmit frees the skb and does not forward it if the result is a true value. All the forwarding path code compiles out when the CONFIG_NET_SWITCHDEV is not set. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1b161ada7ae6..b3ee01b0551b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1859,10 +1859,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1883,6 +1906,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -2060,8 +2086,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -2072,9 +2098,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } -- cgit v1.2.3 From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 23:50:05 +0200 Subject: net: core: decouple ifalias get/set from rtnl lock Device alias can be set by either rtnetlink (rtnl is held) or sysfs. rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose. Add an extra mutex for it and use rcu to protect concurrent accesses. This allows the sysfs path to not take rtnl and would later allow to not hold it when dumping ifalias. Based on suggestion from Eric Dumazet. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/dev.c | 52 +++++++++++++++++++++++++++++++++++++++------------- net/core/net-sysfs.c | 17 ++++++++--------- net/core/rtnetlink.c | 13 +++++++++++-- 3 files changed, 58 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e350c768d4b5..1770097cfd86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @alias: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ? ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6955da0d58d..3961f87cdc76 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev) { @@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) -- cgit v1.2.3 From 1ae2eaaa229bc350b6f38fbf4ab9c873532aecfb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:08 -0300 Subject: sctp: silence warns on sctp_stream_init allocations As SCTP supports up to 65535 streams, that can lead to very large allocations in sctp_stream_init(). As Xin Long noticed, systems with small amounts of memory are more prone to not have enough memory and dump warnings on dmesg initiated by user actions. Thus, silence them. Also, if the reallocation of stream->out is not necessary, skip it and keep the memory we already have. Reported-by: Xin Long Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..1afa95558083 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -40,9 +40,14 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, { int i; + gfp |= __GFP_NOWARN; + /* Initial stream->out size may be very big, so free it and alloc - * a new one with new outcnt to save memory. + * a new one with new outcnt to save memory if needed. */ + if (outcnt == stream->outcnt) + goto in; + kfree(stream->out); stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); @@ -53,6 +58,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; +in: if (!incnt) return 0; -- cgit v1.2.3 From e090abd0d81c7bdbf0c0ba2224624be2b15ded0d Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:09 -0300 Subject: sctp: factor out stream->out allocation There is 1 place allocating it and 2 other reallocating. Move such procedures to a common function. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 1afa95558083..6d0e997d301f 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -35,6 +35,30 @@ #include #include +static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, + gfp_t gfp) +{ + struct sctp_stream_out *out; + + out = kmalloc_array(outcnt, sizeof(*out), gfp); + if (!out) + return -ENOMEM; + + if (stream->out) { + memcpy(out, stream->out, min(outcnt, stream->outcnt) * + sizeof(*out)); + kfree(stream->out); + } + + if (outcnt > stream->outcnt) + memset(out + stream->outcnt, 0, + (outcnt - stream->outcnt) * sizeof(*out)); + + stream->out = out; + + return 0; +} + int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { @@ -48,11 +72,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, if (outcnt == stream->outcnt) goto in; - kfree(stream->out); - - stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); - if (!stream->out) - return -ENOMEM; + i = sctp_stream_alloc_out(stream, outcnt, gfp); + if (i) + return i; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) @@ -276,15 +298,9 @@ int sctp_send_add_streams(struct sctp_association *asoc, } if (out) { - struct sctp_stream_out *streamout; - - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_KERNEL); - if (!streamout) + retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); + if (retval) goto out; - - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; } chunk = sctp_make_strreset_addstrm(asoc, out, in); @@ -682,10 +698,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_out *streamout; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; + int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -714,14 +730,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( if (!out || outcnt > SCTP_MAX_STREAM) goto out; - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_ATOMIC); - if (!streamout) + ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); + if (ret) goto out; - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; - chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; -- cgit v1.2.3 From 1fdb8d8fefe2e7320ea15a65051758a4c4332f05 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:10 -0300 Subject: sctp: factor out stream->in allocation There is 1 place allocating it and another reallocating. Move such procedures to a common function. v2: updated changelog Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6d0e997d301f..952437d656cc 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -59,6 +59,31 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, return 0; } +static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, + gfp_t gfp) +{ + struct sctp_stream_in *in; + + in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + + if (!in) + return -ENOMEM; + + if (stream->in) { + memcpy(in, stream->in, min(incnt, stream->incnt) * + sizeof(*in)); + kfree(stream->in); + } + + if (incnt > stream->incnt) + memset(in + stream->incnt, 0, + (incnt - stream->incnt) * sizeof(*in)); + + stream->in = in; + + return 0; +} + int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { @@ -84,8 +109,8 @@ in: if (!incnt) return 0; - stream->in = kcalloc(incnt, sizeof(*stream->in), gfp); - if (!stream->in) { + i = sctp_stream_alloc_in(stream, incnt, gfp); + if (i) { kfree(stream->out); stream->out = NULL; return -ENOMEM; @@ -623,7 +648,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_in *streamin; __u32 request_seq, incnt; __u16 in, i; @@ -670,13 +694,9 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( if (!in || incnt > SCTP_MAX_STREAM) goto out; - streamin = krealloc(stream->in, incnt * sizeof(*streamin), - GFP_ATOMIC); - if (!streamin) + if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; - memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); - stream->in = streamin; stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; -- cgit v1.2.3 From f952be79cebd49d04154781d99408867a069d375 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:11 -0300 Subject: sctp: introduce struct sctp_stream_out_ext With the stream schedulers, sctp_stream_out will become too big to be allocated by kmalloc and as we need to allocate with BH disabled, we cannot use __vmalloc in sctp_stream_init(). This patch moves out the stats from sctp_stream_out to sctp_stream_out_ext, which will be allocated only when the application tries to sendmsg something on it. Just the introduction of sctp_stream_out_ext would already fix the issue described above by splitting the allocation in two. Moving the stats to it also reduces the pressure on the allocator as we will ask for less memory atomically when creating the socket and we will use GFP_KERNEL later. Then, for stream schedulers, we will just use sctp_stream_out_ext. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 6 +++--- net/sctp/outqueue.c | 4 ++-- net/sctp/socket.c | 27 +++++++++++++++++++++------ net/sctp/stream.c | 16 ++++++++++++++++ 4 files changed, 42 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3afac275ee82..7b261afc47b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && @@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 2966ff400755..746b07b7937d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -366,7 +366,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -404,7 +404,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_stream_out *streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; - streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } msg_len -= SCTP_DATA_SNDSIZE(chk) + diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..d207734326b0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1927,6 +1927,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + /* Allocate sctp_stream_out_ext if not already done */ + if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto out_free; + } + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -6645,7 +6652,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_stream_out *streamout; + struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; @@ -6668,21 +6675,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream.out[params.sprstat_sid]; + streamoute = asoc->stream.out[params.sprstat_sid].ext; + if (!streamoute) { + /* Not allocated yet, means all stats are 0 */ + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + retval = 0; + goto out; + } + if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += - streamout->abandoned_unsent[policy]; + streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += - streamout->abandoned_sent[policy]; + streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = - streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = - streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, ¶ms, len)) { diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 952437d656cc..055ca25bbc91 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -121,8 +121,24 @@ in: return 0; } +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_out_ext *soute; + + soute = kzalloc(sizeof(*soute), GFP_KERNEL); + if (!soute) + return -ENOMEM; + stream->out[sid].ext = soute; + + return 0; +} + void sctp_stream_free(struct sctp_stream *stream) { + int i; + + for (i = 0; i < stream->outcnt; i++) + kfree(stream->out[i].ext); kfree(stream->out); kfree(stream->in); } -- cgit v1.2.3 From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/Makefile | 2 +- net/sctp/outqueue.c | 59 ++++++----- net/sctp/sm_sideeffect.c | 3 + net/sctp/stream.c | 88 +++++++++++++-- net/sctp/stream_sched.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 387 insertions(+), 35 deletions(-) create mode 100644 net/sctp/stream_sched.c (limited to 'net') diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 70f1b570bab9..0f6e6d1d69fd 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o + offload.o stream_sched.o sctp_probe-y := probe.o diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 746b07b7937d..4db012aa25f7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -50,6 +50,7 @@ #include #include +#include /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, - struct sctp_chunk *ch) + struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch = NULL; - - if (!list_empty(&q->out_chunk_list)) { - struct list_head *entry = q->out_chunk_list.next; - - ch = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); - q->out_qlen -= ch->skb->len; - } - return ch; + return q->sched->dequeue(q); } + /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add_tail(&ch->stream_list, &oute->outq); } /* @@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + sctp_sched_set_sched(asoc, SCTP_SS_FCFS); } /* Free the outqueue structure and any related pending chunks. @@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q) /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); @@ -391,13 +400,14 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + q->sched->unsched_all(&asoc->stream); + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; - list_del_init(&chk->list); - q->out_qlen -= chk->skb->len; + sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { @@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, break; } + q->sched->sched_all(&asoc->stream); + return msg_len; } @@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* RFC 2960 6.5 Every DATA chunk MUST carry a valid - * stream identifier. - */ - if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { - - /* Mark as failed send. */ - sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - asoc->sent_cnt_removable--; - sctp_chunk_free(chunk); - continue; - } - /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) else asoc->stats.oodchunks++; + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(q, chunk); + break; default: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..402bfbb888cd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -50,6 +50,7 @@ #include #include #include +#include static int sctp_cmd_interpreter(enum sctp_event event_type, union sctp_subtype subtype, @@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); + + asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 055ca25bbc91..5ea33a2c453b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -32,8 +32,61 @@ * Xin Long */ +#include #include #include +#include + +/* Migrates chunks from stream queues to new stream queues if needed, + * but not across associations. Also, removes those chunks to streams + * higher than the new max. + */ +static void sctp_stream_outq_migrate(struct sctp_stream *stream, + struct sctp_stream *new, __u16 outcnt) +{ + struct sctp_association *asoc; + struct sctp_chunk *ch, *temp; + struct sctp_outq *outq; + int i; + + asoc = container_of(stream, struct sctp_association, stream); + outq = &asoc->outqueue; + + list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { + __u16 sid = sctp_chunk_stream_no(ch); + + if (sid < outcnt) + continue; + + sctp_sched_dequeue_common(outq, ch); + /* No need to call dequeue_done here because + * the chunks are not scheduled by now. + */ + + /* Mark as failed send. */ + sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; + + sctp_chunk_free(ch); + } + + if (new) { + /* Here we actually move the old ext stuff into the new + * buffer, because we want to keep it. Then + * sctp_stream_update will swap ->out pointers. + */ + for (i = 0; i < outcnt; i++) { + kfree(new->out[i].ext); + new->out[i].ext = stream->out[i].ext; + stream->out[i].ext = NULL; + } + } + + for (i = outcnt; i < stream->outcnt; i++) + kfree(stream->out[i].ext); +} static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) @@ -87,7 +140,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - int i; + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i, ret = 0; gfp |= __GFP_NOWARN; @@ -97,6 +151,11 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, if (outcnt == stream->outcnt) goto in; + /* Filter out chunks queued on streams that won't exist anymore */ + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, NULL, outcnt); + sched->sched_all(stream); + i = sctp_stream_alloc_out(stream, outcnt, gfp); if (i) return i; @@ -105,20 +164,27 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; + sched->init(stream); + in: if (!incnt) - return 0; + goto out; i = sctp_stream_alloc_in(stream, incnt, gfp); if (i) { - kfree(stream->out); - stream->out = NULL; - return -ENOMEM; + ret = -ENOMEM; + goto free; } stream->incnt = incnt; + goto out; - return 0; +free: + sched->free(stream); + kfree(stream->out); + stream->out = NULL; +out: + return ret; } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) @@ -130,13 +196,15 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) return -ENOMEM; stream->out[sid].ext = soute; - return 0; + return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } void sctp_stream_free(struct sctp_stream *stream) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; + sched->free(stream); for (i = 0; i < stream->outcnt; i++) kfree(stream->out[i].ext); kfree(stream->out); @@ -156,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; @@ -163,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) stream->outcnt = new->outcnt; stream->incnt = new->incnt; + sched->sched_all(stream); + new->out = NULL; new->in = NULL; } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c new file mode 100644 index 000000000000..40a9a9de2b98 --- /dev/null +++ b/net/sctp/stream_sched.c @@ -0,0 +1,270 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* First Come First Serve (a.k.a. FIFO) + * RFC DRAFT ndata Section 3.1 + */ +static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, + __u16 value, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = 0; + return 0; +} + +static int sctp_sched_fcfs_init(struct sctp_stream *stream) +{ + return 0; +} + +static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + return 0; +} + +static void sctp_sched_fcfs_free(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ +} + +static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_chunk *ch = NULL; + struct list_head *entry; + + if (list_empty(&q->out_chunk_list)) + goto out; + + if (stream->out_curr) { + ch = list_entry(stream->out_curr->ext->outq.next, + struct sctp_chunk, stream_list); + } else { + entry = q->out_chunk_list.next; + ch = list_entry(entry, struct sctp_chunk, list); + } + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *chunk) +{ +} + +static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) +{ +} + +static struct sctp_sched_ops sctp_sched_fcfs = { + .set = sctp_sched_fcfs_set, + .get = sctp_sched_fcfs_get, + .init = sctp_sched_fcfs_init, + .init_sid = sctp_sched_fcfs_init_sid, + .free = sctp_sched_fcfs_free, + .enqueue = sctp_sched_fcfs_enqueue, + .dequeue = sctp_sched_fcfs_dequeue, + .dequeue_done = sctp_sched_fcfs_dequeue_done, + .sched_all = sctp_sched_fcfs_sched_all, + .unsched_all = sctp_sched_fcfs_unsched_all, +}; + +/* API to other parts of the stack */ + +struct sctp_sched_ops *sctp_sched_ops[] = { + &sctp_sched_fcfs, +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched) +{ + struct sctp_sched_ops *n = sctp_sched_ops[sched]; + struct sctp_sched_ops *old = asoc->outqueue.sched; + struct sctp_datamsg *msg = NULL; + struct sctp_chunk *ch; + int i, ret = 0; + + if (old == n) + return ret; + + if (sched > SCTP_SS_MAX) + return -EINVAL; + + if (old) { + old->free(&asoc->stream); + + /* Give the next scheduler a clean slate. */ + for (i = 0; i < asoc->stream.outcnt; i++) { + void *p = asoc->stream.out[i].ext; + + if (!p) + continue; + + p += offsetofend(struct sctp_stream_out_ext, outq); + memset(p, 0, sizeof(struct sctp_stream_out_ext) - + offsetofend(struct sctp_stream_out_ext, outq)); + } + } + + asoc->outqueue.sched = n; + n->init(&asoc->stream); + for (i = 0; i < asoc->stream.outcnt; i++) { + if (!asoc->stream.out[i].ext) + continue; + + ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + if (ret) + goto err; + } + + /* We have to requeue all chunks already queued. */ + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + if (ch->msg == msg) + continue; + msg = ch->msg; + n->enqueue(&asoc->outqueue, msg); + } + + return ret; + +err: + n->free(&asoc->stream); + asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ + + return ret; +} + +int sctp_sched_get_sched(struct sctp_association *asoc) +{ + int i; + + for (i = 0; i <= SCTP_SS_MAX; i++) + if (asoc->outqueue.sched == sctp_sched_ops[i]) + return i; + + return 0; +} + +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) { + int ret; + + ret = sctp_stream_init_ext(&asoc->stream, sid); + if (ret) + return ret; + } + + return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); +} + +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) + return 0; + + return asoc->outqueue.sched->get(&asoc->stream, sid, value); +} + +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) +{ + if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + struct sctp_stream_out *sout; + __u16 sid; + + /* datamsg is not finish, so save it as current one, + * in case application switch scheduler or a higher + * priority stream comes in. + */ + sid = sctp_chunk_stream_no(ch); + sout = &q->asoc->stream.out[sid]; + q->asoc->stream.out_curr = sout; + return; + } + + q->asoc->stream.out_curr = NULL; + q->sched->dequeue_done(q, ch); +} + +/* Auxiliary functions for the schedulers */ +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) +{ + list_del_init(&ch->list); + list_del_init(&ch->stream_list); + q->out_qlen -= ch->skb->len; +} + +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + INIT_LIST_HEAD(&stream->out[sid].ext->outq); + return sched->init_sid(stream, sid, gfp); +} + +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + + return asoc->outqueue.sched; +} -- cgit v1.2.3 From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:14 -0300 Subject: sctp: add sockopt to get/set stream scheduler As defined per RFC Draft ndata Section 4.3.2, named as SCTP_STREAM_SCHEDULER. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d207734326b0..ae35dbf2810f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -79,6 +79,7 @@ #include #include #include +#include /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); @@ -3914,6 +3915,36 @@ out: return retval; } +static int sctp_setsockopt_scheduler(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_SS_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_sched(asoc, params.assoc_value); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4095,6 +4126,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_setsockopt_scheduler(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6793,6 +6827,43 @@ out: return retval; } +static int sctp_getsockopt_scheduler(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = sctp_sched_get_sched(asoc); + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6975,6 +7046,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_getsockopt_scheduler(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:15 -0300 Subject: sctp: add sockopt to get/set stream scheduler parameters As defined per RFC Draft ndata Section 4.3.3, named as SCTP_STREAM_SCHEDULER_VALUE. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ae35dbf2810f..88c28421ec15 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3945,6 +3945,34 @@ out: return retval; } +static int sctp_setsockopt_scheduler_value(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_stream_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_value(asoc, params.stream_id, + params.stream_value, GFP_KERNEL); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4129,6 +4157,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER: retval = sctp_setsockopt_scheduler(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6864,6 +6895,48 @@ out: return retval; } +static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + retval = sctp_sched_get_value(asoc, params.stream_id, + ¶ms.stream_value); + if (retval) + goto out; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7050,6 +7123,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_getsockopt_scheduler_value(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/Makefile | 2 +- net/sctp/stream_sched.c | 3 + net/sctp/stream_sched_prio.c | 347 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 351 insertions(+), 1 deletion(-) create mode 100644 net/sctp/stream_sched_prio.c (limited to 'net') diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0f6e6d1d69fd..647c9cfd4e95 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o + offload.o stream_sched.o stream_sched_prio.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 40a9a9de2b98..115ddb765169 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -121,8 +121,11 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ +extern struct sctp_sched_ops sctp_sched_prio; + struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, + &sctp_sched_prio, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c new file mode 100644 index 000000000000..384dbf3c8760 --- /dev/null +++ b/net/sctp/stream_sched_prio.c @@ -0,0 +1,347 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.4 + */ + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); + +static struct sctp_stream_priorities *sctp_sched_prio_new_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + + p = kmalloc(sizeof(*p), gfp); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->prio_sched); + INIT_LIST_HEAD(&p->active); + p->next = NULL; + p->prio = prio; + + return p; +} + +static struct sctp_stream_priorities *sctp_sched_prio_get_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + int i; + + /* Look into scheduled priorities first, as they are sorted and + * we can find it fast IF it's scheduled. + */ + list_for_each_entry(p, &stream->prio_list, prio_sched) { + if (p->prio == prio) + return p; + if (p->prio > prio) + break; + } + + /* No luck. So we search on all streams now. */ + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + + p = stream->out[i].ext->prio_head; + if (!p) + /* Means all other streams won't be initialized + * as well. + */ + break; + if (p->prio == prio) + return p; + } + + /* If not even there, allocate a new one. */ + return sctp_sched_prio_new_head(stream, prio, gfp); +} + +static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) +{ + struct list_head *pos; + + pos = p->next->prio_list.next; + if (pos == &p->active) + pos = pos->next; + p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); +} + +static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) +{ + bool scheduled = false; + + if (!list_empty(&soute->prio_list)) { + struct sctp_stream_priorities *prio_head = soute->prio_head; + + /* Scheduled */ + scheduled = true; + + if (prio_head->next == soute) + /* Try to move to the next stream */ + sctp_sched_prio_next_stream(prio_head); + + list_del_init(&soute->prio_list); + + /* Also unsched the priority if this was the last stream */ + if (list_empty(&prio_head->active)) { + list_del_init(&prio_head->prio_sched); + /* If there is no stream left, clear next */ + prio_head->next = NULL; + } + } + + return scheduled; +} + +static void sctp_sched_prio_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + struct sctp_stream_priorities *prio, *prio_head; + + prio_head = soute->prio_head; + + /* Nothing to do if already scheduled */ + if (!list_empty(&soute->prio_list)) + return; + + /* Schedule the stream. If there is a next, we schedule the new + * one before it, so it's the last in round robin order. + * If there isn't, we also have to schedule the priority. + */ + if (prio_head->next) { + list_add(&soute->prio_list, prio_head->next->prio_list.prev); + return; + } + + list_add(&soute->prio_list, &prio_head->active); + prio_head->next = soute; + + list_for_each_entry(prio, &stream->prio_list, prio_sched) { + if (prio->prio > prio_head->prio) { + list_add(&prio_head->prio_sched, prio->prio_sched.prev); + return; + } + } + + list_add_tail(&prio_head->prio_sched, &stream->prio_list); +} + +static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out_ext *soute = sout->ext; + struct sctp_stream_priorities *prio_head, *old; + bool reschedule = false; + int i; + + prio_head = sctp_sched_prio_get_head(stream, prio, gfp); + if (!prio_head) + return -ENOMEM; + + reschedule = sctp_sched_prio_unsched(soute); + old = soute->prio_head; + soute->prio_head = prio_head; + if (reschedule) + sctp_sched_prio_sched(stream, soute); + + if (!old) + /* Happens when we set the priority for the first time */ + return 0; + + for (i = 0; i < stream->outcnt; i++) { + soute = stream->out[i].ext; + if (soute && soute->prio_head == old) + /* It's still in use, nothing else to do here. */ + return 0; + } + + /* No hits, we are good to free it. */ + kfree(old); + + return 0; +} + +static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = stream->out[sid].ext->prio_head->prio; + return 0; +} + +static int sctp_sched_prio_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->prio_list); + + return 0; +} + +static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + return sctp_sched_prio_set(stream, sid, 0, gfp); +} + +static void sctp_sched_prio_free(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *prio, *n; + LIST_HEAD(list); + int i; + + /* As we don't keep a list of priorities, to avoid multiple + * frees we have to do it in 3 steps: + * 1. unsched everyone, so the lists are free to use in 2. + * 2. build the list of the priorities + * 3. free the list + */ + sctp_sched_prio_unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + prio = stream->out[i].ext->prio_head; + if (prio && list_empty(&prio->prio_sched)) + list_add(&prio->prio_sched, &list); + } + list_for_each_entry_safe(prio, n, &list, prio_sched) { + list_del_init(&prio->prio_sched); + kfree(prio); + } +} + +static void sctp_sched_prio_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_prio_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next. It's easy, it's either the current + * one or the first chunk on the next active stream. + */ + if (stream->out_curr) { + soute = stream->out_curr->ext; + } else { + prio = list_entry(stream->prio_list.next, + struct sctp_stream_priorities, prio_sched); + soute = prio->next; + } + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream on + * this priority. + */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + prio = soute->prio_head; + + sctp_sched_prio_next_stream(prio); + + if (list_empty(&soute->outq)) + sctp_sched_prio_unsched(soute); +} + +static void sctp_sched_prio_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out *sout; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + sout = &stream->out[sid]; + if (sout->ext) + sctp_sched_prio_sched(stream, sout->ext); + } +} + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *p, *tmp; + struct sctp_stream_out_ext *soute, *souttmp; + + list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) + list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) + sctp_sched_prio_unsched(soute); +} + +struct sctp_sched_ops sctp_sched_prio = { + .set = sctp_sched_prio_set, + .get = sctp_sched_prio_get, + .init = sctp_sched_prio_init, + .init_sid = sctp_sched_prio_init_sid, + .free = sctp_sched_prio_free, + .enqueue = sctp_sched_prio_enqueue, + .dequeue = sctp_sched_prio_dequeue, + .dequeue_done = sctp_sched_prio_dequeue_done, + .sched_all = sctp_sched_prio_sched_all, + .unsched_all = sctp_sched_prio_unsched_all, +}; -- cgit v1.2.3 From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Priority Based Scheduler (SCTP_SS_RR). Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/Makefile | 3 +- net/sctp/stream_sched.c | 2 + net/sctp/stream_sched_rr.c | 201 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 net/sctp/stream_sched_rr.c (limited to 'net') diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 647c9cfd4e95..bf90c5397719 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o stream_sched_prio.o + offload.o stream_sched.o stream_sched_prio.o \ + stream_sched_rr.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 115ddb765169..03513a9fa110 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -122,10 +122,12 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ extern struct sctp_sched_ops sctp_sched_prio; +extern struct sctp_sched_ops sctp_sched_rr; struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, &sctp_sched_prio, + &sctp_sched_rr, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c new file mode 100644 index 000000000000..7612a438c5b9 --- /dev/null +++ b/net/sctp/stream_sched_rr.c @@ -0,0 +1,201 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.2 + */ +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream); + +static void sctp_sched_rr_next_stream(struct sctp_stream *stream) +{ + struct list_head *pos; + + pos = stream->rr_next->rr_list.next; + if (pos == &stream->rr_list) + pos = pos->next; + stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list); +} + +static void sctp_sched_rr_unsched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (stream->rr_next == soute) + /* Try to move to the next stream */ + sctp_sched_rr_next_stream(stream); + + list_del_init(&soute->rr_list); + + /* If we have no other stream queued, clear next */ + if (list_empty(&stream->rr_list)) + stream->rr_next = NULL; +} + +static void sctp_sched_rr_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (!list_empty(&soute->rr_list)) + /* Already scheduled. */ + return; + + /* Schedule the stream */ + list_add_tail(&soute->rr_list, &stream->rr_list); + + if (!stream->rr_next) + stream->rr_next = soute; +} + +static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + return 0; +} + +static int sctp_sched_rr_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->rr_list); + stream->rr_next = NULL; + + return 0; +} + +static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + + return 0; +} + +static void sctp_sched_rr_free(struct sctp_stream *stream) +{ + sctp_sched_rr_unsched_all(stream); +} + +static void sctp_sched_rr_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_rr_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next */ + if (stream->out_curr) + soute = stream->out_curr->ext; + else + soute = stream->rr_next; + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + + sctp_sched_rr_next_stream(&q->asoc->stream); + + if (list_empty(&soute->outq)) + sctp_sched_rr_unsched(&q->asoc->stream, soute); +} + +static void sctp_sched_rr_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + soute = stream->out[sid].ext; + if (soute) + sctp_sched_rr_sched(stream, soute); + } +} + +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_out_ext *soute, *tmp; + + list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list) + sctp_sched_rr_unsched(stream, soute); +} + +struct sctp_sched_ops sctp_sched_rr = { + .set = sctp_sched_rr_set, + .get = sctp_sched_rr_get, + .init = sctp_sched_rr_init, + .init_sid = sctp_sched_rr_init_sid, + .free = sctp_sched_rr_free, + .enqueue = sctp_sched_rr_enqueue, + .dequeue = sctp_sched_rr_dequeue, + .dequeue_done = sctp_sched_rr_dequeue_done, + .sched_all = sctp_sched_rr_sched_all, + .unsched_all = sctp_sched_rr_unsched_all, +}; -- cgit v1.2.3 From 522e89d60b62d16075a7bc96dd5eb2f3eb813635 Mon Sep 17 00:00:00 2001 From: simran singhal Date: Mon, 11 Sep 2017 21:52:37 +0200 Subject: netfilter: ipset: Compress return logic Simplify function returns by merging assignment and return into one command line. Signed-off-by: simran singhal Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 178d4eba013b..2fff6b5dc6f0 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -453,7 +453,6 @@ static size_t list_set_memsize(const struct list_set *map, size_t dsize) { struct set_elem *e; - size_t memsize; u32 n = 0; rcu_read_lock(); @@ -461,9 +460,7 @@ list_set_memsize(const struct list_set *map, size_t dsize) n++; rcu_read_unlock(); - memsize = sizeof(*map) + n * dsize; - - return memsize; + return (sizeof(*map) + n * dsize); } static int -- cgit v1.2.3 From 8851e799ffde0d85fab6716075975200c9b5094c Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 11 Sep 2017 21:52:38 +0200 Subject: netfilter: ipset: Fix sparse warnings Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ipportnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5ab1b99a53c2..24bf55860108 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -434,7 +434,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { - u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; -- cgit v1.2.3 From 63c2af90e502fca6cda2855541d43443666e337f Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Mon, 11 Sep 2017 21:52:39 +0200 Subject: netfilter: ipset: deduplicate prefixlen maps The prefixlen maps used here are identical, and have been since introduction. It seems to make sense to use a single large map, that the preprocessor will fill appropriately. Signed-off-by: Aaron Conole Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/pfxlen.c | 395 +++++++++++++++---------------------------- 1 file changed, 137 insertions(+), 258 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 1c8a42c1056c..d5be9c25fad6 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -3,6 +3,141 @@ /* Prefixlen maps for fast conversions, by Jan Engelhardt. */ +#ifdef E +#undef E +#endif + +#define PREFIXES_MAP \ + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), \ + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + #define E(a, b, c, d) \ {.ip6 = { \ htonl(a), htonl(b), \ @@ -13,135 +148,7 @@ * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { - E(0x00000000, 0x00000000, 0x00000000, 0x00000000), - E(0x80000000, 0x00000000, 0x00000000, 0x00000000), - E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_netmask_map); @@ -155,135 +162,7 @@ EXPORT_SYMBOL_GPL(ip_set_netmask_map); * just use prefixlen_hostmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_hostmask_map[] = { - E(0x00000000, 0x00000000, 0x00000000, 0x00000000), - E(0x80000000, 0x00000000, 0x00000000, 0x00000000), - E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), - E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), - E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), - E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), + PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_hostmask_map); -- cgit v1.2.3 From 20e883204f0268b423799781e5efed786f8a11ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 13:56:50 +0200 Subject: net: core: fix kerneldoc comment net/core/dev.c:1306: warning: No description found for parameter 'name' net/core/dev.c:1306: warning: Excess function parameter 'alias' description in 'dev_get_alias' Fixes: 6c5570016b97 ("net: core: decouple ifalias get/set from rtnl lock") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1770097cfd86..454f05441546 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1295,7 +1295,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) /** * dev_get_alias - get ifalias of a device * @dev: device - * @alias: buffer to store name of ifalias + * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go -- cgit v1.2.3 From e774d96b7d2c3489bfb5bbdc2b65ed41cd68d3d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:55:29 +0200 Subject: rtnetlink: remove slave_validate callback no users in the tree. Signed-off-by: Florian Westphal Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3961f87cdc76..b63c5759641f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2631,12 +2631,6 @@ replay: return err; slave_data = slave_attr; } - if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data, - extack); - if (err < 0) - return err; - } } if (dev) { -- cgit v1.2.3 From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 14 +------------- net/ipv6/addrconf.c | 4 ++-- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b63c5759641f..3fb1ca33cba4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -475,18 +475,6 @@ void rtnl_af_register(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_register); -/** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister @@ -494,7 +482,7 @@ EXPORT_SYMBOL_GPL(__rtnl_af_unregister); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del(&ops->list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f553f72d0bee..837418ff2d4b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6618,9 +6618,9 @@ void addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { -- cgit v1.2.3 From c818fa9e288be5be7e360c33cf4f5e30f9fa206e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 10:48:35 -0700 Subject: net: cache skb_shinfo() in skb_try_coalesce() Compiler does not really know that skb_shinfo(to|from) are constants in skb_try_coalesce(), lets cache their values to shrink code. We might even take care of skb_zcopy() calls later. $ size net/core/skbuff.o.before net/core/skbuff.o text data bss dec hex filename 40727 1298 0 42025 a429 net/core/skbuff.o.before 40631 1298 0 41929 a3c9 net/core/skbuff.o Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d98c2e3ce2bf..822a90e56aea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4767,6 +4767,7 @@ EXPORT_SYMBOL(kfree_skb_partial); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { + struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; @@ -4781,7 +4782,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; @@ -4790,8 +4793,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct page *page; unsigned int offset; - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) @@ -4802,12 +4805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); @@ -4815,19 +4818,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; + from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); to->truesize += delta; to->len += len; -- cgit v1.2.3 From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 Oct 2017 13:53:23 +0200 Subject: dev: advertise the new nsid when the netns iface changes x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, this kind of interfaces is created in the link netns and then moved to the upper netns. At the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, thus the user always knows where is the link part. There is no such mechanism in the link netns. When the interface is moved to another netns, the user cannot "follow" it. This patch adds a new netlink attribute which helps to follow an interface which moves to another netns. When the interface is unregistered, the new nsid is advertised. If the interface is a x-netns interface (ie rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed. CC: Jason A. Donenfeld Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 11 ++++++++--- net/core/rtnetlink.c | 31 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 454f05441546..bffc75429184 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7204,7 +7205,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); + GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -8291,7 +8292,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8347,7 +8348,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3fb1ca33cba4..1ee98b1369d5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -915,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1384,7 +1385,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1472,6 +1473,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_link_netnsid(skb, dev)) goto nla_put_failure; + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) goto nla_put_failure; @@ -1701,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask, 0, NULL); if (err < 0) { if (likely(skb->len)) @@ -2808,7 +2813,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2893,7 +2898,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags) + u32 event, gfp_t flags, int *new_nsid) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2904,7 +2909,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, + new_nsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2927,14 +2933,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags) + gfp_t flags, int *new_nsid) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -2942,10 +2948,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } EXPORT_SYMBOL(rtmsg_ifinfo); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid); +} + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -4321,7 +4334,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL); + GFP_KERNEL, NULL); break; default: break; -- cgit v1.2.3 From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:45 -0700 Subject: net: Add extack to netdev_notifier_info Add netlink_ext_ack to netdev_notifier_info to allow notifier handlers to return errors to userspace. Clean up the initialization in dev.c such that extack is easily added in subsequent patches where relevant. Specifically, remove the init call in call_netdevice_notifiers_info and have callers initalize on stack when info is declared. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index bffc75429184..e27a6bc0ac4d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -163,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -1339,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1563,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1690,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1709,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -6278,7 +6279,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6296,12 +6305,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6312,7 +6316,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6376,20 +6380,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6405,11 +6413,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6535,11 +6545,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6830,11 +6842,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; - - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } -- cgit v1.2.3 From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: net: Add extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 3 ++- net/core/rtnetlink.c | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..e7d5fbb6ad53 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..cb0131d70ab1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -320,7 +320,8 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee98b1369d5..c5ee429bcce9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1957,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1982,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2115,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2753,7 +2754,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } -- cgit v1.2.3 From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:47 -0700 Subject: net: Add extack to upper device linking Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/8021q/vlan.c | 6 +++--- net/8021q/vlan.h | 2 +- net/8021q/vlan_netlink.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 15 ++++++++++----- net/openvswitch/vport-netdev.c | 3 ++- 7 files changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..f7b413b9297e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..0a3fd727048d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -540,7 +540,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index e27a6bc0ac4d..fcddccb6be41 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6277,11 +6277,13 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, + .extack = extack, }, .upper_dev = upper_dev, .master = master, @@ -6341,9 +6343,11 @@ rollback: * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6362,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; -- cgit v1.2.3 From ca752be006013ac6f19721280f44c91ef07ad3d1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:50 -0700 Subject: net: bridge: Pass extack to down to netdev_master_upper_dev_link Pass extack arg to br_add_if. Add messages for a couple of failures and pass arg to netdev_master_upper_dev_link. Signed-off-by: David Ahern Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 2 +- net/bridge/br_if.c | 15 +++++++++++---- net/bridge/br_ioctl.c | 2 +- net/bridge/br_private.h | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index cb0131d70ab1..7acb77c9bd65 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -326,7 +326,7 @@ static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, { struct net_bridge *br = netdev_priv(dev); - return br_add_if(br, slave_dev); + return br_add_if(br, slave_dev, extack); } static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 0a3fd727048d..59a74a414e20 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -480,7 +480,8 @@ netdev_features_t br_features_recompute(struct net_bridge *br, } /* called with RTNL */ -int br_add_if(struct net_bridge *br, struct net_device *dev) +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; @@ -500,16 +501,22 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return -EINVAL; /* No bridging of bridges */ - if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) + if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { + NL_SET_ERR_MSG(extack, + "Can not enslave a bridge to a bridge"); return -ELOOP; + } /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ - if (dev->priv_flags & IFF_DONT_BRIDGE) + if (dev->priv_flags & IFF_DONT_BRIDGE) { + NL_SET_ERR_MSG(extack, + "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; + } p = new_nbp(br, dev); if (IS_ERR(p)) @@ -540,7 +547,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 66cd98772051..8f29103935a3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -98,7 +98,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) return -EINVAL; if (isadd) - ret = br_add_if(br, dev); + ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 020c709a017f..ab4df24f7bba 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -566,7 +566,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, void br_port_carrier_check(struct net_bridge_port *p); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); -int br_add_if(struct net_bridge *br, struct net_device *dev); +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); int br_min_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, -- cgit v1.2.3 From 44f209807ee87a5eddf6c0432f3fb63cec27bad8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:50 -0400 Subject: VSOCK: export socket tables for sock_diag interface The socket table symbols need to be exported from vsock.ko so that the vsock_diag.ko module will be able to traverse sockets. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dfc8c51e4d74..9afe4da8c67d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -153,7 +153,6 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ -#define VSOCK_HASH_SIZE 251 #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) @@ -168,9 +167,12 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) -static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; -static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; -static DEFINE_SPINLOCK(vsock_table_lock); +struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +EXPORT_SYMBOL_GPL(vsock_bind_table); +struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +EXPORT_SYMBOL_GPL(vsock_connected_table); +DEFINE_SPINLOCK(vsock_table_lock); +EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) -- cgit v1.2.3 From bf359b8127719535f88494adb3c2b73c06667dcd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:51 -0400 Subject: VSOCK: move __vsock_in_bound/connected_table() to af_vsock.h The vsock_diag.ko module will need to check socket table membership. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9afe4da8c67d..9b179a0081b3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -250,16 +250,6 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, return NULL; } -static bool __vsock_in_bound_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->bound_table); -} - -static bool __vsock_in_connected_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->connected_table); -} - static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); -- cgit v1.2.3 From 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:52 -0400 Subject: VSOCK: use TCP state constants for sk_state There are two state fields: socket->state and sock->sk_state. The socket->state field uses SS_UNCONNECTED, SS_CONNECTED, etc while the sock->sk_state typically uses values that match TCP state constants (TCP_CLOSE, TCP_ESTABLISHED). AF_VSOCK does not follow this convention and instead uses SS_* constants for both fields. The sk_state field will be exposed to userspace through the vsock_diag interface for ss(8), netstat(8), and other programs. This patch switches sk_state to TCP state constants so that the meaning of this field is consistent with other address families. Not just AF_INET and AF_INET6 use the TCP constants, AF_UNIX and others do too. The following mapping was used to convert the code: SS_FREE -> TCP_CLOSE SS_UNCONNECTED -> TCP_CLOSE SS_CONNECTING -> TCP_SYN_SENT SS_CONNECTED -> TCP_ESTABLISHED SS_DISCONNECTING -> TCP_CLOSING VSOCK_SS_LISTEN -> TCP_LISTEN In __vsock_create() the sk_state initialization was dropped because sock_init_data() already initializes sk_state to TCP_CLOSE. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 46 ++++++++++++++++------------ net/vmw_vsock/hyperv_transport.c | 12 ++++---- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 22 ++++++------- net/vmw_vsock/vmci_transport.c | 34 ++++++++++---------- net/vmw_vsock/vmci_transport_notify.c | 2 +- net/vmw_vsock/vmci_transport_notify_qstate.c | 2 +- 7 files changed, 64 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9b179a0081b3..98359c19522f 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -36,7 +36,7 @@ * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the VSOCK_SS_LISTEN state. When a + * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. @@ -82,6 +82,15 @@ * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. + * + * - sk->sk_state uses the TCP state constants because they are widely used by + * other address families and exposed to userspace tools like ss(8): + * + * TCP_CLOSE - unconnected + * TCP_SYN_SENT - connecting + * TCP_ESTABLISHED - connected + * TCP_CLOSING - disconnecting + * TCP_LISTEN - listening */ #include @@ -477,7 +486,7 @@ void vsock_pending_work(struct work_struct *work) if (vsock_in_connected_table(vsk)) vsock_remove_connected(vsk); - sk->sk_state = SS_FREE; + sk->sk_state = TCP_CLOSE; out: release_sock(sk); @@ -617,7 +626,6 @@ struct sock *__vsock_create(struct net *net, sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; - sk->sk_state = 0; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); @@ -891,7 +899,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == VSOCK_SS_LISTEN + if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -920,7 +928,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( @@ -942,7 +950,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == SS_UNCONNECTED) { + if (sk->sk_state == TCP_CLOSE) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; @@ -1112,9 +1120,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk = sk_vsock(vsk); lock_sock(sk); - if (sk->sk_state == SS_CONNECTING && + if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); cancel = 1; @@ -1160,7 +1168,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == VSOCK_SS_LISTEN) || + if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1183,7 +1191,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - sk->sk_state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) @@ -1203,7 +1211,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection @@ -1226,13 +1234,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1243,7 +1251,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; @@ -1276,7 +1284,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, goto out; } - if (listener->sk_state != VSOCK_SS_LISTEN) { + if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } @@ -1366,7 +1374,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = VSOCK_SS_LISTEN; + sk->sk_state = TCP_LISTEN; err = 0; @@ -1546,7 +1554,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { - err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } @@ -1557,7 +1565,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != SS_CONNECTED || + if (sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1681,7 +1689,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, lock_sock(sk); - if (sk->sk_state != SS_CONNECTED) { + if (sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..bbac023e70d1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,7 +310,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; @@ -344,8 +344,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; - if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || - (!conn_from_host && sk->sk_state != SS_CONNECTING)) + if ((conn_from_host && sk->sk_state != TCP_LISTEN) || + (!conn_from_host && sk->sk_state != TCP_SYN_SENT)) goto out; if (conn_from_host) { @@ -357,7 +357,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!new) goto out; - new->sk_state = SS_CONNECTING; + new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); hvs_new = vnew->trans; hvs_new->chan = chan; @@ -384,7 +384,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vmbus_set_chn_rescind_callback(chan, hvs_close_connection); if (conn_from_host) { - new->sk_state = SS_CONNECTED; + new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; hvs_addr_init(&vnew->local_addr, if_type); @@ -399,7 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_enqueue_accept(sk, new); release_sock(sk); } else { - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsock_sk(sk)); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 403d86e80162..8e03bd3f3668 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -414,7 +414,7 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { lock_sock(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index edba7ab97563..3ae3a33da70b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -708,7 +708,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && @@ -748,8 +748,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; - if (!(sk->sk_state == SS_CONNECTED || - sk->sk_state == SS_DISCONNECTING)) + if (!(sk->sk_state == TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ @@ -801,7 +801,7 @@ virtio_transport_recv_connecting(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RESPONSE: - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -821,7 +821,7 @@ virtio_transport_recv_connecting(struct sock *sk, destroy: virtio_transport_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -857,7 +857,7 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); break; @@ -928,7 +928,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) lock_sock_nested(child, SINGLE_DEPTH_NESTING); - child->sk_state = SS_CONNECTED; + child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), @@ -1016,18 +1016,18 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) sk->sk_write_space(sk); switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: virtio_transport_recv_listen(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, pkt); break; - case SS_DISCONNECTING: + case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, pkt); virtio_transport_free_pkt(pkt); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0206155bff53..391775e3575c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -742,7 +742,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) /* The local context ID may be out of date, update it. */ vsk->local_addr.svm_cid = dst.svm_cid; - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vmci_trans(vsk)->notify_ops->handle_notify_pkt( sk, pkt, true, &dst, &src, &bh_process_pkt); @@ -800,7 +800,9 @@ static void vmci_transport_handle_detach(struct sock *sk) * left in our consume queue. */ if (vsock_stream_has_data(vsk) <= 0) { - if (sk->sk_state == SS_CONNECTING) { + sk->sk_state = TCP_CLOSE; + + if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., * if the peer VM is killed after attaching to @@ -809,12 +811,10 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ - sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } - sk->sk_state = SS_UNCONNECTED; } sk->sk_state_change(sk); } @@ -882,17 +882,17 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: vmci_transport_recv_listen(sk, pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: /* Processing of pending connections for servers goes through * the listening socket, so see vmci_transport_recv_listen() * for that path. */ vmci_transport_recv_connecting_client(sk, pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: vmci_transport_recv_connected(sk, pkt); break; default: @@ -941,7 +941,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; switch (pending->sk_state) { - case SS_CONNECTING: + case TCP_SYN_SENT: err = vmci_transport_recv_connecting_server(sk, pending, pkt); @@ -1071,7 +1071,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_add_pending(sk, pending); sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; + pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; vmci_trans(vpending)->queue_pair_size = qp_size; @@ -1196,11 +1196,11 @@ vmci_transport_recv_connecting_server(struct sock *listener, * the socket will be valid until it is removed from the queue. * * If we fail sending the attach below, we remove the socket from the - * connected list and move the socket to SS_UNCONNECTED before + * connected list and move the socket to TCP_CLOSE before * releasing the lock, so a pending slow path processing of an incoming * packet will not see the socket in the connected state in that case. */ - pending->sk_state = SS_CONNECTED; + pending->sk_state = TCP_ESTABLISHED; vsock_insert_connected(vpending); @@ -1231,7 +1231,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, destroy: pending->sk_err = skerr; - pending->sk_state = SS_UNCONNECTED; + pending->sk_state = TCP_CLOSE; /* As long as we drop our reference, all necessary cleanup will handle * when the cleanup function drops its reference and our destruct * implementation is called. Note that since the listen handler will @@ -1269,7 +1269,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, * accounting (it can already be found since it's in the bound * table). */ - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -1337,7 +1337,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, destroy: vmci_transport_send_reset(sk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -1525,7 +1525,7 @@ static int vmci_transport_recv_connected(struct sock *sk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); break; @@ -1789,7 +1789,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) err = vmci_transport_send_conn_request( sk, vmci_trans(vsk)->queue_pair_size); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } } else { @@ -1799,7 +1799,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) sk, vmci_trans(vsk)->queue_pair_size, supported_proto_versions); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 1406db4d97d1..41fb427f150a 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -355,7 +355,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) return -1; diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index f3a0afc46208..0cc84f2bb05e 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -176,7 +176,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); *data_ready_now = false; } -- cgit v1.2.3 From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:53 -0400 Subject: VSOCK: add sock_diag interface This patch adds the sock_diag interface for querying sockets from userspace. Tools like ss(8) and netstat(8) can use this interface to list open sockets. The userspace ABI is defined in and includes netlink request and response structs. The request can query sockets based on their sk_state (e.g. listening sockets only) and the response contains socket information fields including the local/remote addresses, inode number, etc. This patch does not dump VMCI pending sockets because I have only tested the virtio transport, which does not use pending sockets. Support can be added later by extending vsock_diag_dump() if needed by VMCI users. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/Kconfig | 10 +++ net/vmw_vsock/Makefile | 3 + net/vmw_vsock/diag.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 net/vmw_vsock/diag.c (limited to 'net') diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index a24369d175fd..970f96489fe7 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -15,6 +15,16 @@ config VSOCKETS To compile this driver as a module, choose M here: the module will be called vsock. If unsure, say N. +config VSOCKETS_DIAG + tristate "Virtual Sockets monitoring interface" + depends on VSOCKETS + default y + help + Support for PF_VSOCK sockets monitoring interface used by the ss tool. + If unsure, say Y. + + Enable this module so userspace applications can query open sockets. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index e63d574234a9..64afc06805da 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o @@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o +vsock_diag-y += diag.o + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c new file mode 100644 index 000000000000..31b567652250 --- /dev/null +++ b/net/vmw_vsock/diag.c @@ -0,0 +1,186 @@ +/* + * vsock sock_diag(7) module + * + * Copyright (C) 2017 Red Hat, Inc. + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_diag_msg *rep; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->vdiag_family = AF_VSOCK; + + /* Lock order dictates that sk_lock is acquired before + * vsock_table_lock, so we cannot lock here. Simply don't take + * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is + * held. + */ + rep->vdiag_type = sk->sk_type; + rep->vdiag_state = sk->sk_state; + rep->vdiag_shutdown = sk->sk_shutdown; + rep->vdiag_src_cid = vsk->local_addr.svm_cid; + rep->vdiag_src_port = vsk->local_addr.svm_port; + rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; + rep->vdiag_dst_port = vsk->remote_addr.svm_port; + rep->vdiag_ino = sock_i_ino(sk); + + sock_diag_save_cookie(sk, rep->vdiag_cookie); + + return 0; +} + +static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct vsock_diag_req *req; + struct vsock_sock *vsk; + unsigned int bucket; + unsigned int last_i; + unsigned int table; + struct net *net; + unsigned int i; + + req = nlmsg_data(cb->nlh); + net = sock_net(skb->sk); + + /* State saved between calls: */ + table = cb->args[0]; + bucket = cb->args[1]; + i = last_i = cb->args[2]; + + /* TODO VMCI pending sockets? */ + + spin_lock_bh(&vsock_table_lock); + + /* Bind table (locally created sockets) */ + if (table == 0) { + while (bucket < ARRAY_SIZE(vsock_bind_table)) { + struct list_head *head = &vsock_bind_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, bound_table) { + struct sock *sk = sk_vsock(vsk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_bind; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_bind; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_bind: + i++; + } + last_i = 0; + bucket++; + } + + table++; + bucket = 0; + } + + /* Connected table (accepted connections) */ + while (bucket < ARRAY_SIZE(vsock_connected_table)) { + struct list_head *head = &vsock_connected_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, connected_table) { + struct sock *sk = sk_vsock(vsk); + + /* Skip sockets we've already seen above */ + if (__vsock_in_bound_table(vsk)) + continue; + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_connected; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_connected; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_connected: + i++; + } + last_i = 0; + bucket++; + } + +done: + spin_unlock_bh(&vsock_table_lock); + + cb->args[0] = table; + cb->args[1] = bucket; + cb->args[2] = i; + + return skb->len; +} + +static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct vsock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = vsock_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler vsock_diag_handler = { + .family = AF_VSOCK, + .dump = vsock_diag_handler_dump, +}; + +static int __init vsock_diag_init(void) +{ + return sock_diag_register(&vsock_diag_handler); +} + +static void __exit vsock_diag_exit(void) +{ + sock_diag_unregister(&vsock_diag_handler); +} + +module_init(vsock_diag_init); +module_exit(vsock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, + 40 /* AF_VSOCK */); -- cgit v1.2.3 From 27204aaa9dc67b833b77179fdac556288ec3a4bf Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, BPF, etc, after successful connection is very inconsistent. This introduces inconsistent bevhavior and is prone to bugs. The current call sequence is as follows: (1) for active case (tcp_finish_connect() case): tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); (2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case): icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tcp_init_buffer_space(sk); tcp_init_metrics(sk); (3) for TFO passive case (tcp_fastopen_create_child()): inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); This commit uniforms the above functions to have the following sequence: tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); This sequence is the same as the (1) active case. We pick this sequence because this order correctly allows BPF to override the settings including congestion control module and initial cwnd, etc from the route, and then allows the CC module to see those settings. Suggested-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 7 +------ net/ipv4/tcp_input.c | 21 +++------------------ 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 23225c98d287..c115e37ca608 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -456,6 +456,18 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { if (tsflags && skb) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index de470e7e586f..29fff14d5a53 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -236,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db9bb46b5776..bd3a35f5dbf2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5513,20 +5513,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5693,7 +5686,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5920,14 +5912,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5957,8 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * are sent out. */ tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + } if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); -- cgit v1.2.3 From 6d05081e55a5a55b32ae6609f1902eca8277541d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:04:04 -0700 Subject: tcp: clean up TFO server's initial tcp_rearm_rto() call This commit does a cleanup and moves tcp_rearm_rto() call in the TFO server case into a previous spot in tcp_rcv_state_process() to make it more compact. This is only a cosmetic change. Suggested-by: Yuchung Cheng Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bd3a35f5dbf2..c5b8d61846c2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5911,6 +5911,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; @@ -5933,18 +5942,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); -- cgit v1.2.3 From 1bcdca3ffbbbed0d09b46cf4263ed2593a418375 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Wed, 4 Oct 2017 15:59:49 -0400 Subject: net/ipv4: Remove unused variable in route.c int rc is unmodified after initalization in net/ipv4/route.c, this patch simply cleans up that variable and returns 0. This was found with coccicheck M=net/ipv4/ on linus' tree. Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6fde5d45f1..1c7ed77968c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3038,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3095,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From 9dff9936f0ccfffba5106ee4217c71c7bcf95143 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Wed, 4 Oct 2017 12:10:43 -0700 Subject: RDS: IB: Limit the scope of has_fr/has_fmr variables This patch fixes the scope of has_fr and has_fmr variables as they are needed only in rds_ib_add_one(). Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 11 ++++++----- net/rds/ib.h | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index a0954ace3774..36dd2099048a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -126,6 +126,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; + bool has_fr, has_fmr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) @@ -143,11 +144,11 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index bf4822407567..6ea6a27891b0 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -215,8 +215,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - bool has_fmr; - bool has_fr; bool use_fastreg; unsigned int max_mrs; -- cgit v1.2.3 From b1fb67fa501c4787035317f84db6caf013385581 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Wed, 4 Oct 2017 12:11:29 -0700 Subject: RDS: IB: Initialize max_items based on underlying device attributes Use max_1m_mrs/max_8k_mrs while setting max_items, as the former variables are set based on the underlying device attributes. Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 9a3c54e659e9..e678699268a2 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -601,11 +601,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; - pool->max_items = RDS_MR_1M_POOL_SIZE; + pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; - pool->max_items = RDS_MR_8K_POOL_SIZE; + pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; -- cgit v1.2.3 From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 12:59:58 -0700 Subject: tcp: new list for sent but unacked skbs for RACK recovery This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP Rack recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterates through SACKed skbs repeatedly. Special cares for rare events: 1. TCP repair fakes skb transmission so the send queue needs adjusted 2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, send-queue correctly queues the pure SYN packet. For TFO which queues a pure SYN and then a data packet, send-queue only queues the data packet but not the pure SYN due to the structure of TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order to not grow sk_buff, we use an union for the new list and _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 9 +++++++-- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 42 +++++++++++++++++++++++++++++++----------- 4 files changed, 41 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c115e37ca608..8cf742fd4f99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b8d61846c2..fb0d7ed84b94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..8162e2880178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3023,6 +3037,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk) } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); -- cgit v1.2.3 From 043b87d7599ed8e86a33f4cbc3f062d57e263711 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 4 Oct 2017 12:59:59 -0700 Subject: tcp: more efficient RACK loss detection Use the new time-ordered list to speed up RACK. The detection logic is identical. But since the list is chronologically ordered by skb_mstamp and contains only skbs not yet acked or sacked, RACK can abort the loop upon hitting skbs that were sent more recently. On YouTube servers this patch reduces the iterations on write queue by 40x. The improvement is even bigger with large BDP networks. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 449cd914d58e..8aa56caefde8 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -58,17 +58,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) - continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: @@ -81,6 +74,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if (remaining < 0) { tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); continue; } @@ -91,11 +85,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ + } else { break; } } -- cgit v1.2.3 From bef06223083b81d2064824afe2bc85be416ab73a Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 4 Oct 2017 13:00:00 -0700 Subject: tcp: a small refactor of RACK loss detection Refactor the RACK loop to improve readability and speed up the checks. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 8aa56caefde8..cda6074a429a 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -61,32 +61,28 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - list_del_init(&skb->tcp_tsorted_anchor); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - } else { - break; } } } -- cgit v1.2.3 From 380537b4f7f2e706e499695b38eabc90a5f72fa8 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 02:07:08 +0800 Subject: net: ipv6: remove unused code in ipv6_find_hdr() Storing the left length of skb into 'len' actually has no effect so we can remove it. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 115d60919f72..11025f8d124b 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -187,7 +187,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; bool found; if (fragoff) @@ -204,7 +203,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } - len = skb->len - start; do { struct ipv6_opt_hdr _hdr, *hp; @@ -273,7 +271,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, if (!found) { nexthdr = hp->nexthdr; - len -= hdrlen; start += hdrlen; } } while (!found); -- cgit v1.2.3 From cc71b7b071192ac1c288e272fdc3f3877eb96663 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 15:45:32 -0400 Subject: net/ipv6: remove unused err variable on icmpv6_push_pending_frames int err is unused by icmpv6_push_pending_frames(), this patch returns removes the variable and returns the function with 0. git bisect shows this variable has been around since linux has been in git in commit 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2. This was found by running make coccicheck M=net/ipv6/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current HEAD as of this patch). Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5acb54405b10..aeb49b4d8c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -255,7 +255,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, { struct sk_buff *skb; struct icmp6hdr *icmp6h; - int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) @@ -288,7 +287,7 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } ip6_push_pending_frames(sk); out: - return err; + return 0; } struct icmpv6_msg { -- cgit v1.2.3 From 4c02d62fa37a05254a87575c7d430819f77bd6c9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 Oct 2017 10:39:10 -0700 Subject: net/mac80211/mesh_plink: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This requires adding a pointer back to the sta_info since container_of() can't resolve the sta_info. Cc: Johannes Berg Cc: "David S. Miller" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Thomas Gleixner Signed-off-by: Kees Cook Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 1 + net/mac80211/mesh_plink.c | 10 ++++------ net/mac80211/sta_info.c | 4 +++- net/mac80211/sta_info.h | 2 ++ 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 7e5f271e3c30..465b7853edc0 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -275,6 +275,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, struct ieee802_11_elems *ie); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); +void mesh_plink_timer(struct timer_list *t); void mesh_plink_broken(struct sta_info *sta); u32 mesh_plink_deactivate(struct sta_info *sta); u32 mesh_plink_open(struct sta_info *sta); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index dc8e10f87207..e2d00cce3c17 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -603,8 +603,9 @@ out: ieee80211_mbss_info_change_notify(sdata, changed); } -static void mesh_plink_timer(unsigned long data) +void mesh_plink_timer(struct timer_list *t) { + struct mesh_sta *mesh = from_timer(mesh, t, plink_timer); struct sta_info *sta; u16 reason = 0; struct ieee80211_sub_if_data *sdata; @@ -616,7 +617,7 @@ static void mesh_plink_timer(unsigned long data) * del_timer_sync() this timer after having made sure * it cannot be readded (by deleting the plink.) */ - sta = (struct sta_info *) data; + sta = mesh->plink_sta; if (sta->sdata->local->quiescing) return; @@ -696,11 +697,8 @@ static void mesh_plink_timer(unsigned long data) static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout) { - sta->mesh->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); - sta->mesh->plink_timer.data = (unsigned long) sta; - sta->mesh->plink_timer.function = mesh_plink_timer; sta->mesh->plink_timeout = timeout; - add_timer(&sta->mesh->plink_timer); + mod_timer(&sta->mesh->plink_timer, jiffies + msecs_to_jiffies(timeout)); } static bool llid_in_use(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index ffcd25c4908c..9673e157bf8f 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -329,10 +329,12 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; + sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.user_mpm) - init_timer(&sta->mesh->plink_timer); + timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, + 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a35c964f6217..5c54acd10562 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -344,6 +344,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8) * @plink_state: peer link state * @plink_timeout: timeout of peer link * @plink_timer: peer link watch timer + * @plink_sta: peer link watch timer's sta_info * @t_offset: timing offset relative to this host * @t_offset_setpoint: reference timing offset of this sta to be used when * calculating clockdrift @@ -356,6 +357,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8) */ struct mesh_sta { struct timer_list plink_timer; + struct sta_info *plink_sta; s64 t_offset; s64 t_offset_setpoint; -- cgit v1.2.3 From 4e64b1ed15e25b8dcc2819c6d43dab72eb0bea26 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2017 23:46:14 -0700 Subject: net/ipv6: Convert icmpv6_push_pending_frames to void commit cc71b7b07119 ("net/ipv6: remove unused err variable on icmpv6_push_pending_frames") exposed icmpv6_push_pending_frames return value not being used. Remove now unnecessary int err declarations and uses. Miscellanea: o Remove unnecessary goto and out: labels o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 43 ++++++++++++++++++------------------------- net/ipv6/ping.c | 5 ++--- 2 files changed, 20 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index aeb49b4d8c7d..4e52d52a6752 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len) +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) - goto out; + return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); @@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, tmp_csum); } ip6_push_pending_frames(sk); -out: - return 0; } struct icmpv6_msg { @@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || @@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - err = ip6_append_data(sk, icmpv6_getfrag, &msg, - len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused); - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: @@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(net, sk, &dst, &fl6); - if (err) + if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) @@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; - err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused); - - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), &ipc6, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + &sockc_unused)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - skb->len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } dst_release(dst); out: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ac826dd338ff..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, - (struct icmp6hdr *) &pfh.icmph, - len); + icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); -- cgit v1.2.3 From 47eb2ac809189e0a60ad78eec6db9e84004e11be Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 28 Sep 2017 17:14:51 +0300 Subject: Bluetooth: move ecdh allocation outside of ecdh_helper Before this change, a new crypto tfm was allocated, each time, for both key generation and shared secret computation. Allocate a single tfm for both cases. Signed-off-by: Tudor Ambarus Signed-off-by: Marcel Holtmann --- net/bluetooth/ecdh_helper.c | 32 ++++----------------- net/bluetooth/ecdh_helper.h | 8 ++++-- net/bluetooth/selftest.c | 29 +++++++++++++------ net/bluetooth/smp.c | 68 ++++++++++++++++++++++++++++++++++++--------- 4 files changed, 87 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index c7b1a9aee579..ac2c7087921d 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -23,7 +23,6 @@ #include "ecdh_helper.h" #include -#include #include struct ecdh_completion { @@ -50,10 +49,9 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) out[i] = __swab64(in[ndigits - 1 - i]); } -bool compute_ecdh_secret(const u8 public_key[64], const u8 private_key[32], - u8 secret[32]) +bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], + const u8 private_key[32], u8 secret[32]) { - struct crypto_kpp *tfm; struct kpp_request *req; struct ecdh p; struct ecdh_completion result; @@ -66,16 +64,9 @@ bool compute_ecdh_secret(const u8 public_key[64], const u8 private_key[32], if (!tmp) return false; - tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); - if (IS_ERR(tfm)) { - pr_err("alg: kpp: Failed to load tfm for kpp: %ld\n", - PTR_ERR(tfm)); - goto free_tmp; - } - req = kpp_request_alloc(tfm, GFP_KERNEL); if (!req) - goto free_kpp; + goto free_tmp; init_completion(&result.completion); @@ -126,16 +117,14 @@ free_all: kzfree(buf); free_req: kpp_request_free(req); -free_kpp: - crypto_free_kpp(tfm); free_tmp: kfree(tmp); return (err == 0); } -bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]) +bool generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], + u8 private_key[32]) { - struct crypto_kpp *tfm; struct kpp_request *req; struct ecdh p; struct ecdh_completion result; @@ -150,16 +139,9 @@ bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]) if (!tmp) return false; - tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); - if (IS_ERR(tfm)) { - pr_err("alg: kpp: Failed to load tfm for kpp: %ld\n", - PTR_ERR(tfm)); - goto free_tmp; - } - req = kpp_request_alloc(tfm, GFP_KERNEL); if (!req) - goto free_kpp; + goto free_tmp; init_completion(&result.completion); @@ -218,8 +200,6 @@ free_all: kzfree(buf); free_req: kpp_request_free(req); -free_kpp: - crypto_free_kpp(tfm); free_tmp: kfree(tmp); return (err == 0); diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h index 7a423faf76e5..5cde37d12fd9 100644 --- a/net/bluetooth/ecdh_helper.h +++ b/net/bluetooth/ecdh_helper.h @@ -20,8 +20,10 @@ * COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS * SOFTWARE IS DISCLAIMED. */ +#include #include -bool compute_ecdh_secret(const u8 pub_a[64], const u8 priv_b[32], - u8 secret[32]); -bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]); +bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pub_a[64], + const u8 priv_b[32], u8 secret[32]); +bool generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], + u8 private_key[32]); diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 34a1227f4391..126bdc5a77a7 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -138,9 +138,9 @@ static const u8 dhkey_3[32] __initconst = { 0x7c, 0x1c, 0xf9, 0x49, 0xe6, 0xd7, 0xaa, 0x70, }; -static int __init test_ecdh_sample(const u8 priv_a[32], const u8 priv_b[32], - const u8 pub_a[64], const u8 pub_b[64], - const u8 dhkey[32]) +static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32], + const u8 priv_b[32], const u8 pub_a[64], + const u8 pub_b[64], const u8 dhkey[32]) { u8 *tmp, *dhkey_a, *dhkey_b; int ret = 0; @@ -152,8 +152,8 @@ static int __init test_ecdh_sample(const u8 priv_a[32], const u8 priv_b[32], dhkey_a = &tmp[0]; dhkey_b = &tmp[32]; - compute_ecdh_secret(pub_b, priv_a, dhkey_a); - compute_ecdh_secret(pub_a, priv_b, dhkey_b); + compute_ecdh_secret(tfm, pub_b, priv_a, dhkey_a); + compute_ecdh_secret(tfm, pub_a, priv_b, dhkey_b); if (memcmp(dhkey_a, dhkey, 32)) { ret = -EINVAL; @@ -185,30 +185,43 @@ static const struct file_operations test_ecdh_fops = { static int __init test_ecdh(void) { + struct crypto_kpp *tfm; ktime_t calltime, delta, rettime; unsigned long long duration; int err; calltime = ktime_get(); - err = test_ecdh_sample(priv_a_1, priv_b_1, pub_a_1, pub_b_1, dhkey_1); + tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(tfm)) { + BT_ERR("Unable to create ECDH crypto context"); + err = PTR_ERR(tfm); + goto done; + } + + err = test_ecdh_sample(tfm, priv_a_1, priv_b_1, pub_a_1, pub_b_1, + dhkey_1); if (err) { BT_ERR("ECDH sample 1 failed"); goto done; } - err = test_ecdh_sample(priv_a_2, priv_b_2, pub_a_2, pub_b_2, dhkey_2); + err = test_ecdh_sample(tfm, priv_a_2, priv_b_2, pub_a_2, pub_b_2, + dhkey_2); if (err) { BT_ERR("ECDH sample 2 failed"); goto done; } - err = test_ecdh_sample(priv_a_3, priv_a_3, pub_a_3, pub_a_3, dhkey_3); + err = test_ecdh_sample(tfm, priv_a_3, priv_a_3, pub_a_3, pub_a_3, + dhkey_3); if (err) { BT_ERR("ECDH sample 3 failed"); goto done; } + crypto_free_kpp(tfm); + rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a0ef89772c36..31b64bc3cf16 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -92,6 +93,7 @@ struct smp_dev { struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; + struct crypto_kpp *tfm_ecdh; }; struct smp_chan { @@ -131,6 +133,7 @@ struct smp_chan { struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; + struct crypto_kpp *tfm_ecdh; }; /* These debug key values are defined in the SMP section of the core @@ -574,7 +577,8 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) get_random_bytes(smp->local_sk, 32); /* Generate local key pair for Secure Connections */ - if (!generate_ecdh_keys(smp->local_pk, smp->local_sk)) + if (!generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, + smp->local_sk)) return -EIO; /* This is unlikely, but we need to check that @@ -771,6 +775,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); + crypto_free_kpp(smp->tfm_ecdh); /* Ensure that we don't leave any debug key around if debug key * support hasn't been explicitly enabled. @@ -1391,16 +1396,19 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create AES crypto context"); - kzfree(smp); - return NULL; + goto zfree_smp; } smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_cipher(smp->tfm_aes); - kzfree(smp); - return NULL; + goto free_cipher; + } + + smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(smp->tfm_ecdh)) { + BT_ERR("Unable to create ECDH crypto context"); + goto free_shash; } smp->conn = conn; @@ -1413,6 +1421,14 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) hci_conn_hold(conn->hcon); return smp; + +free_shash: + crypto_free_shash(smp->tfm_cmac); +free_cipher: + crypto_free_cipher(smp->tfm_aes); +zfree_smp: + kzfree(smp); + return NULL; } static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) @@ -1903,7 +1919,8 @@ static u8 sc_send_public_key(struct smp_chan *smp) get_random_bytes(smp->local_sk, 32); /* Generate local key pair for Secure Connections */ - if (!generate_ecdh_keys(smp->local_pk, smp->local_sk)) + if (!generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, + smp->local_sk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that @@ -2677,7 +2694,8 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); - if (!compute_ecdh_secret(smp->remote_pk, smp->local_sk, smp->dhkey)) + if (!compute_ecdh_secret(smp->tfm_ecdh, smp->remote_pk, smp->local_sk, + smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); @@ -3169,6 +3187,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) struct smp_dev *smp; struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; + struct crypto_kpp *tfm_ecdh; if (cid == L2CAP_CID_SMP_BREDR) { smp = NULL; @@ -3194,8 +3213,18 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) return ERR_CAST(tfm_cmac); } + tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(tfm_ecdh)) { + BT_ERR("Unable to create ECDH crypto context"); + crypto_free_shash(tfm_cmac); + crypto_free_cipher(tfm_aes); + kzfree(smp); + return ERR_CAST(tfm_ecdh); + } + smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; + smp->tfm_ecdh = tfm_ecdh; smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; @@ -3205,6 +3234,7 @@ create_chan: if (smp) { crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); + crypto_free_kpp(smp->tfm_ecdh); kzfree(smp); } return ERR_PTR(-ENOMEM); @@ -3252,6 +3282,7 @@ static void smp_del_chan(struct l2cap_chan *chan) chan->data = NULL; crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); + crypto_free_kpp(smp->tfm_ecdh); kzfree(smp); } @@ -3498,13 +3529,13 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) out[i] = __swab64(in[ndigits - 1 - i]); } -static int __init test_debug_key(void) +static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { u8 pk[64], sk[32]; swap_digits((u64 *)debug_sk, (u64 *)sk, 4); - if (!generate_ecdh_keys(pk, sk)) + if (!generate_ecdh_keys(tfm_ecdh, pk, sk)) return -EINVAL; if (crypto_memneq(sk, debug_sk, 32)) @@ -3763,7 +3794,8 @@ static const struct file_operations test_smp_fops = { }; static int __init run_selftests(struct crypto_cipher *tfm_aes, - struct crypto_shash *tfm_cmac) + struct crypto_shash *tfm_cmac, + struct crypto_kpp *tfm_ecdh) { ktime_t calltime, delta, rettime; unsigned long long duration; @@ -3771,7 +3803,7 @@ static int __init run_selftests(struct crypto_cipher *tfm_aes, calltime = ktime_get(); - err = test_debug_key(); + err = test_debug_key(tfm_ecdh); if (err) { BT_ERR("debug_key test failed"); goto done; @@ -3848,6 +3880,7 @@ int __init bt_selftest_smp(void) { struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; + struct crypto_kpp *tfm_ecdh; int err; tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); @@ -3863,10 +3896,19 @@ int __init bt_selftest_smp(void) return PTR_ERR(tfm_cmac); } - err = run_selftests(tfm_aes, tfm_cmac); + tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(tfm_ecdh)) { + BT_ERR("Unable to create ECDH crypto context"); + crypto_free_shash(tfm_cmac); + crypto_free_cipher(tfm_aes); + return PTR_ERR(tfm_ecdh); + } + + err = run_selftests(tfm_aes, tfm_cmac, tfm_ecdh); crypto_free_shash(tfm_cmac); crypto_free_cipher(tfm_aes); + crypto_free_kpp(tfm_ecdh); return err; } -- cgit v1.2.3 From a297641610963bbb238ea77b32a2e958c4f7b184 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 28 Sep 2017 17:14:52 +0300 Subject: Bluetooth: ecdh_helper - reveal error codes ecdh_helper functions were hiding the error codes and chose to return the return value of an relational operator, "==". Remove the unnecessary query and reveal the error codes. While updating the return values, code in a way that compilers will warn in case of uninitialized err. Signed-off-by: Tudor Ambarus Signed-off-by: Marcel Holtmann --- net/bluetooth/ecdh_helper.c | 32 +++++++++++++++++++------------- net/bluetooth/ecdh_helper.h | 8 ++++---- net/bluetooth/smp.c | 17 ++++++++++------- 3 files changed, 33 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index ac2c7087921d..22c8daa0b451 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -49,8 +49,8 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) out[i] = __swab64(in[ndigits - 1 - i]); } -bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], - const u8 private_key[32], u8 secret[32]) +int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], + const u8 private_key[32], u8 secret[32]) { struct kpp_request *req; struct ecdh p; @@ -58,15 +58,17 @@ bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], struct scatterlist src, dst; u8 *tmp, *buf; unsigned int buf_len; - int err = -ENOMEM; + int err; tmp = kmalloc(64, GFP_KERNEL); if (!tmp) - return false; + return -ENOMEM; req = kpp_request_alloc(tfm, GFP_KERNEL); - if (!req) + if (!req) { + err = -ENOMEM; goto free_tmp; + } init_completion(&result.completion); @@ -80,8 +82,10 @@ bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], p.curve_id = ECC_CURVE_NIST_P256; buf_len = crypto_ecdh_key_len(&p); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) + if (!buf) { + err = -ENOMEM; goto free_req; + } crypto_ecdh_encode_key(buf, buf_len, &p); @@ -119,11 +123,11 @@ free_req: kpp_request_free(req); free_tmp: kfree(tmp); - return (err == 0); + return err; } -bool generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], - u8 private_key[32]) +int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], + u8 private_key[32]) { struct kpp_request *req; struct ecdh p; @@ -131,17 +135,19 @@ bool generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], struct scatterlist dst; u8 *tmp, *buf; unsigned int buf_len; - int err = -ENOMEM; + int err; const unsigned short max_tries = 16; unsigned short tries = 0; tmp = kmalloc(64, GFP_KERNEL); if (!tmp) - return false; + return -ENOMEM; req = kpp_request_alloc(tfm, GFP_KERNEL); - if (!req) + if (!req) { + err = -ENOMEM; goto free_tmp; + } init_completion(&result.completion); @@ -202,5 +208,5 @@ free_req: kpp_request_free(req); free_tmp: kfree(tmp); - return (err == 0); + return err; } diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h index 5cde37d12fd9..50e6676aebf5 100644 --- a/net/bluetooth/ecdh_helper.h +++ b/net/bluetooth/ecdh_helper.h @@ -23,7 +23,7 @@ #include #include -bool compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pub_a[64], - const u8 priv_b[32], u8 secret[32]); -bool generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], - u8 private_key[32]); +int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pub_a[64], + const u8 priv_b[32], u8 secret[32]); +int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], + u8 private_key[32]); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 31b64bc3cf16..af7e6100e55b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -577,9 +577,10 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) get_random_bytes(smp->local_sk, 32); /* Generate local key pair for Secure Connections */ - if (!generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, - smp->local_sk)) - return -EIO; + err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, + smp->local_sk); + if (err) + return err; /* This is unlikely, but we need to check that * we didn't accidentially generate a debug key. @@ -1919,8 +1920,8 @@ static u8 sc_send_public_key(struct smp_chan *smp) get_random_bytes(smp->local_sk, 32); /* Generate local key pair for Secure Connections */ - if (!generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, - smp->local_sk)) + if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, + smp->local_sk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that @@ -3532,11 +3533,13 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { u8 pk[64], sk[32]; + int err; swap_digits((u64 *)debug_sk, (u64 *)sk, 4); - if (!generate_ecdh_keys(tfm_ecdh, pk, sk)) - return -EINVAL; + err = generate_ecdh_keys(tfm_ecdh, pk, sk); + if (err) + return err; if (crypto_memneq(sk, debug_sk, 32)) return -EINVAL; -- cgit v1.2.3 From 3814baf3f2473226e479fc1f4f48d01de2ce0a7b Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 28 Sep 2017 17:14:53 +0300 Subject: Bluetooth: selftest - check for errors when computing ZZ Signed-off-by: Tudor Ambarus Signed-off-by: Marcel Holtmann --- net/bluetooth/selftest.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 126bdc5a77a7..ce99648ed870 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -143,7 +143,7 @@ static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32], const u8 pub_b[64], const u8 dhkey[32]) { u8 *tmp, *dhkey_a, *dhkey_b; - int ret = 0; + int ret; tmp = kmalloc(64, GFP_KERNEL); if (!tmp) @@ -152,8 +152,13 @@ static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32], dhkey_a = &tmp[0]; dhkey_b = &tmp[32]; - compute_ecdh_secret(tfm, pub_b, priv_a, dhkey_a); - compute_ecdh_secret(tfm, pub_a, priv_b, dhkey_b); + ret = compute_ecdh_secret(tfm, pub_b, priv_a, dhkey_a); + if (ret) + goto out; + + ret = compute_ecdh_secret(tfm, pub_a, priv_b, dhkey_b); + if (ret) + goto out; if (memcmp(dhkey_a, dhkey, 32)) { ret = -EINVAL; -- cgit v1.2.3 From 168ed65483a1777c2570f4c0a4a64e20a823cf25 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 28 Sep 2017 17:14:54 +0300 Subject: Bluetooth: ecdh_helper - fix leak of private key tmp buffer contains the swapped private key. In case the setkey call failed, the tmp buffer was freed without clearing the private key. Zeroize the temporary buffer so we don't leak the private key. Signed-off-by: Tudor Ambarus Signed-off-by: Marcel Holtmann --- net/bluetooth/ecdh_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index 22c8daa0b451..16e022f5ab27 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -122,7 +122,7 @@ free_all: free_req: kpp_request_free(req); free_tmp: - kfree(tmp); + kzfree(tmp); return err; } -- cgit v1.2.3 From c0153b0b901a16663ff91504fea25fb51d57cc29 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 28 Sep 2017 17:14:55 +0300 Subject: Bluetooth: let the crypto subsystem generate the ecc privkey That Bluetooth SMP knows about the private key is pointless, since the detection of debug key usage is actually via the public key portion. With this patch, the Bluetooth SMP will stop keeping a copy of the ecdh private key and will let the crypto subsystem to generate and handle the ecdh private key, potentially benefiting of hardware ecc private key generation and retention. The loop that tries to generate a correct private key is now removed and we trust the crypto subsystem to generate a correct private key. This backup logic should be done in crypto, if really needed. Signed-off-by: Tudor Ambarus Signed-off-by: Marcel Holtmann --- net/bluetooth/ecdh_helper.c | 186 ++++++++++++++++++++++++-------------------- net/bluetooth/ecdh_helper.h | 9 ++- net/bluetooth/selftest.c | 14 +++- net/bluetooth/smp.c | 66 +++++++--------- 4 files changed, 147 insertions(+), 128 deletions(-) (limited to 'net') diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index 16e022f5ab27..2155ce802877 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -49,15 +49,21 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) out[i] = __swab64(in[ndigits - 1 - i]); } +/* compute_ecdh_secret() - function assumes that the private key was + * already set. + * @tfm: KPP tfm handle allocated with crypto_alloc_kpp(). + * @public_key: pair's ecc public key. + * secret: memory where the ecdh computed shared secret will be saved. + * + * Return: zero on success; error code in case of error. + */ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], - const u8 private_key[32], u8 secret[32]) + u8 secret[32]) { struct kpp_request *req; - struct ecdh p; + u8 *tmp; struct ecdh_completion result; struct scatterlist src, dst; - u8 *tmp, *buf; - unsigned int buf_len; int err; tmp = kmalloc(64, GFP_KERNEL); @@ -72,28 +78,6 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], init_completion(&result.completion); - /* Security Manager Protocol holds digits in litte-endian order - * while ECC API expect big-endian data - */ - swap_digits((u64 *)private_key, (u64 *)tmp, 4); - p.key = (char *)tmp; - p.key_size = 32; - /* Set curve_id */ - p.curve_id = ECC_CURVE_NIST_P256; - buf_len = crypto_ecdh_key_len(&p); - buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - err = -ENOMEM; - goto free_req; - } - - crypto_ecdh_encode_key(buf, buf_len, &p); - - /* Set A private Key */ - err = crypto_kpp_set_secret(tfm, (void *)buf, buf_len); - if (err) - goto free_all; - swap_digits((u64 *)public_key, (u64 *)tmp, 4); /* x */ swap_digits((u64 *)&public_key[32], (u64 *)&tmp[32], 4); /* y */ @@ -118,26 +102,76 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], memcpy(secret, tmp, 32); free_all: - kzfree(buf); -free_req: kpp_request_free(req); free_tmp: kzfree(tmp); return err; } -int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], - u8 private_key[32]) +/* set_ecdh_privkey() - set or generate ecc private key. + * + * Function generates an ecc private key in the crypto subsystem when receiving + * a NULL private key or sets the received key when not NULL. + * + * @tfm: KPP tfm handle allocated with crypto_alloc_kpp(). + * @private_key: user's ecc private key. When not NULL, the key is expected + * in little endian format. + * + * Return: zero on success; error code in case of error. + */ +int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32]) +{ + u8 *buf, *tmp = NULL; + unsigned int buf_len; + int err; + struct ecdh p = {0}; + + p.curve_id = ECC_CURVE_NIST_P256; + + if (private_key) { + tmp = kmalloc(32, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + swap_digits((u64 *)private_key, (u64 *)tmp, 4); + p.key = tmp; + p.key_size = 32; + } + + buf_len = crypto_ecdh_key_len(&p); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + goto free_tmp; + } + + err = crypto_ecdh_encode_key(buf, buf_len, &p); + if (err) + goto free_all; + + err = crypto_kpp_set_secret(tfm, buf, buf_len); + /* fall through */ +free_all: + kzfree(buf); +free_tmp: + kzfree(tmp); + return err; +} + +/* generate_ecdh_public_key() - function assumes that the private key was + * already set. + * + * @tfm: KPP tfm handle allocated with crypto_alloc_kpp(). + * @public_key: memory where the computed ecc public key will be saved. + * + * Return: zero on success; error code in case of error. + */ +int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]) { struct kpp_request *req; - struct ecdh p; + u8 *tmp; struct ecdh_completion result; struct scatterlist dst; - u8 *tmp, *buf; - unsigned int buf_len; int err; - const unsigned short max_tries = 16; - unsigned short tries = 0; tmp = kmalloc(64, GFP_KERNEL); if (!tmp) @@ -150,63 +184,47 @@ int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], } init_completion(&result.completion); + sg_init_one(&dst, tmp, 64); + kpp_request_set_input(req, NULL, 0); + kpp_request_set_output(req, &dst, 64); + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + ecdh_complete, &result); - /* Set curve_id */ - p.curve_id = ECC_CURVE_NIST_P256; - p.key_size = 32; - buf_len = crypto_ecdh_key_len(&p); - buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) - goto free_req; - - do { - if (tries++ >= max_tries) - goto free_all; - - /* Set private Key */ - p.key = (char *)private_key; - crypto_ecdh_encode_key(buf, buf_len, &p); - err = crypto_kpp_set_secret(tfm, buf, buf_len); - if (err) - goto free_all; - - sg_init_one(&dst, tmp, 64); - kpp_request_set_input(req, NULL, 0); - kpp_request_set_output(req, &dst, 64); - kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - ecdh_complete, &result); - - err = crypto_kpp_generate_public_key(req); - - if (err == -EINPROGRESS) { - wait_for_completion(&result.completion); - err = result.err; - } - - /* Private key is not valid. Regenerate */ - if (err == -EINVAL) - continue; - - if (err < 0) - goto free_all; - else - break; - - } while (true); - - /* Keys are handed back in little endian as expected by Security - * Manager Protocol + err = crypto_kpp_generate_public_key(req); + if (err == -EINPROGRESS) { + wait_for_completion(&result.completion); + err = result.err; + } + if (err < 0) + goto free_all; + + /* The public key is handed back in little endian as expected by + * the Security Manager Protocol. */ swap_digits((u64 *)tmp, (u64 *)public_key, 4); /* x */ swap_digits((u64 *)&tmp[32], (u64 *)&public_key[32], 4); /* y */ - swap_digits((u64 *)private_key, (u64 *)tmp, 4); - memcpy(private_key, tmp, 32); free_all: - kzfree(buf); -free_req: kpp_request_free(req); free_tmp: kfree(tmp); return err; } + +/* generate_ecdh_keys() - generate ecc key pair. + * + * @tfm: KPP tfm handle allocated with crypto_alloc_kpp(). + * @public_key: memory where the computed ecc public key will be saved. + * + * Return: zero on success; error code in case of error. + */ +int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64]) +{ + int err; + + err = set_ecdh_privkey(tfm, NULL); + if (err) + return err; + + return generate_ecdh_public_key(tfm, public_key); +} diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h index 50e6676aebf5..a6f8d03d4aaf 100644 --- a/net/bluetooth/ecdh_helper.h +++ b/net/bluetooth/ecdh_helper.h @@ -23,7 +23,8 @@ #include #include -int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pub_a[64], - const u8 priv_b[32], u8 secret[32]); -int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64], - u8 private_key[32]); +int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pair_public_key[64], + u8 secret[32]); +int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 *private_key); +int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]); +int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64]); diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index ce99648ed870..2d1519d0affa 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -152,11 +152,11 @@ static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32], dhkey_a = &tmp[0]; dhkey_b = &tmp[32]; - ret = compute_ecdh_secret(tfm, pub_b, priv_a, dhkey_a); + ret = set_ecdh_privkey(tfm, priv_a); if (ret) goto out; - ret = compute_ecdh_secret(tfm, pub_a, priv_b, dhkey_b); + ret = compute_ecdh_secret(tfm, pub_b, dhkey_a); if (ret) goto out; @@ -165,9 +165,17 @@ static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32], goto out; } + ret = set_ecdh_privkey(tfm, priv_b); + if (ret) + goto out; + + ret = compute_ecdh_secret(tfm, pub_a, dhkey_b); + if (ret) + goto out; + if (memcmp(dhkey_b, dhkey, 32)) ret = -EINVAL; - + /* fall through*/ out: kfree(tmp); return ret; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index af7e6100e55b..d41449b9e9d6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -84,7 +84,6 @@ enum { struct smp_dev { /* Secure Connections OOB data */ u8 local_pk[64]; - u8 local_sk[32]; u8 local_rand[16]; bool debug_key; @@ -126,7 +125,6 @@ struct smp_chan { /* Secure Connections variables */ u8 local_pk[64]; - u8 local_sk[32]; u8 remote_pk[64]; u8 dhkey[32]; u8 mackey[16]; @@ -568,24 +566,22 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { BT_DBG("Using debug keys"); + err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); + if (err) + return err; memcpy(smp->local_pk, debug_pk, 64); - memcpy(smp->local_sk, debug_sk, 32); smp->debug_key = true; } else { while (true) { - /* Seed private key with random number */ - get_random_bytes(smp->local_sk, 32); - - /* Generate local key pair for Secure Connections */ - err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, - smp->local_sk); + /* Generate key pair for Secure Connections */ + err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk); if (err) return err; /* This is unlikely, but we need to check that * we didn't accidentially generate a debug key. */ - if (crypto_memneq(smp->local_sk, debug_sk, 32)) + if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } smp->debug_key = false; @@ -593,7 +589,6 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) SMP_DBG("OOB Public Key X: %32phN", smp->local_pk); SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32); - SMP_DBG("OOB Private Key: %32phN", smp->local_sk); get_random_bytes(smp->local_rand, 16); @@ -1900,7 +1895,6 @@ static u8 sc_send_public_key(struct smp_chan *smp) smp_dev = chan->data; memcpy(smp->local_pk, smp_dev->local_pk, 64); - memcpy(smp->local_sk, smp_dev->local_sk, 32); memcpy(smp->lr, smp_dev->local_rand, 16); if (smp_dev->debug_key) @@ -1911,23 +1905,20 @@ static u8 sc_send_public_key(struct smp_chan *smp) if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { BT_DBG("Using debug keys"); + if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk)) + return SMP_UNSPECIFIED; memcpy(smp->local_pk, debug_pk, 64); - memcpy(smp->local_sk, debug_sk, 32); set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); } else { while (true) { - /* Seed private key with random number */ - get_random_bytes(smp->local_sk, 32); - - /* Generate local key pair for Secure Connections */ - if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk, - smp->local_sk)) + /* Generate key pair for Secure Connections */ + if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that * we didn't accidentially generate a debug key. */ - if (crypto_memneq(smp->local_sk, debug_sk, 32)) + if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } } @@ -1935,7 +1926,6 @@ static u8 sc_send_public_key(struct smp_chan *smp) done: SMP_DBG("Local Public Key X: %32phN", smp->local_pk); SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32); - SMP_DBG("Local Private Key: %32phN", smp->local_sk); smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk); @@ -2663,6 +2653,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = hcon->hdev; + struct crypto_kpp *tfm_ecdh; struct smp_cmd_pairing_confirm cfm; int err; @@ -2695,8 +2686,18 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); - if (!compute_ecdh_secret(smp->tfm_ecdh, smp->remote_pk, smp->local_sk, - smp->dhkey)) + /* Compute the shared secret on the same crypto tfm on which the private + * key was set/generated. + */ + if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { + struct smp_dev *smp_dev = chan->data; + + tfm_ecdh = smp_dev->tfm_ecdh; + } else { + tfm_ecdh = smp->tfm_ecdh; + } + + if (compute_ecdh_secret(tfm_ecdh, smp->remote_pk, smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); @@ -3522,27 +3523,18 @@ void smp_unregister(struct hci_dev *hdev) #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) -static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) -{ - int i; - - for (i = 0; i < ndigits; i++) - out[i] = __swab64(in[ndigits - 1 - i]); -} - static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { - u8 pk[64], sk[32]; + u8 pk[64]; int err; - swap_digits((u64 *)debug_sk, (u64 *)sk, 4); - - err = generate_ecdh_keys(tfm_ecdh, pk, sk); + err = set_ecdh_privkey(tfm_ecdh, debug_sk); if (err) return err; - if (crypto_memneq(sk, debug_sk, 32)) - return -EINVAL; + err = generate_ecdh_public_key(tfm_ecdh, pk); + if (err) + return err; if (crypto_memneq(pk, debug_pk, 64)) return -EINVAL; -- cgit v1.2.3 From b49ef29d72bd2975b8b2cc84b1fa6b40aa2edd5f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 6 Oct 2017 20:42:45 +0200 Subject: Bluetooth: Fix compiler warning with selftest duration calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CC net/bluetooth/selftest.o net/bluetooth/selftest.c: In function ‘bt_selftest_init’: net/bluetooth/selftest.c:246:3: warning: ‘duration’ may be used uninitialized in this function [-Wmaybe-uninitialized] snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ "PASS (%llu usecs)\n", duration); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/bluetooth/selftest.c:203:21: note: ‘duration’ was declared here unsigned long long duration; ^~~~~~~~ Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 2d1519d0affa..03e3c89c3046 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -200,7 +200,7 @@ static int __init test_ecdh(void) { struct crypto_kpp *tfm; ktime_t calltime, delta, rettime; - unsigned long long duration; + unsigned long long duration = 0; int err; calltime = ktime_get(); -- cgit v1.2.3 From 18a4c0eab2623cc95be98a1e6af1ad18e7695977 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:21 -0700 Subject: net: add rb_to_skb() and other rb tree helpers Geeralize private netem_rb_to_skb() TCP rtx queue will soon be converted to rb-tree, so we will need skb_rbtree_walk() helpers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 8 +++----- net/ipv4/tcp_input.c | 33 ++++++++++++--------------------- net/sched/sch_netem.c | 14 ++++---------- 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 29fff14d5a53..7ee4aadcdd71 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0d7ed84b94..90afe4143596 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4335,7 +4335,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4399,7 +4399,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4458,7 +4458,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4503,9 +4503,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4520,7 +4518,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4706,7 +4704,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4735,7 +4733,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4854,26 +4852,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. - */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4916,14 +4907,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5a4f10080290..db0228a65e8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -148,12 +148,6 @@ struct netem_skb_cb { psched_time_t time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch) struct rb_node *p = rb_first(&q->t_root); while (p) { - struct sk_buff *skb = netem_rb_to_skb(p); + struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); @@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *t_skb; struct netem_skb_cb *t_last; - t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) { @@ -617,7 +611,7 @@ deliver: if (p) { psched_time_t time_to_send; - skb = netem_rb_to_skb(p); + skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; -- cgit v1.2.3 From ac3f09ba3e496bd7cc780ead05b1d1bb5f33aedb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:22 -0700 Subject: tcp: uninline tcp_write_queue_purge() Since the upcoming rtx rbtree will add some extra code, it is time to not inline this fat function anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cf742fd4f99..f8ebae62f834 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2318,6 +2318,20 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); -- cgit v1.2.3 From 4e8cc22803080853a33bafc6748e133448fb8182 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:23 -0700 Subject: tcp: tcp_tx_timestamp() cleanup tcp_write_queue_tail() call can be factorized. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8ebae62f834..b8d379c80936 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -469,8 +469,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_init_buffer_space(sk); } -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -1041,7 +1043,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1418,7 +1420,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: -- cgit v1.2.3 From 5e76ee4b8e90384936a214cde5b7816424f70232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:24 -0700 Subject: tcp: tcp_mark_head_lost() optimization It will be a bit more expensive to get the head of rtx queue once rtx queue is converted to an rb-tree. We can avoid this extra cost in case tp->lost_skb_hint is set. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 90afe4143596..abc3b1db81f8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2207,12 +2207,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { skb = tcp_write_queue_head(sk); cnt = 0; -- cgit v1.2.3 From 8ba6ddaaf86c4c6814774e4e4ef158b732bd9f9f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:25 -0700 Subject: tcp: reduce tcp_fastretrans_alert() verbosity With upcoming rb-tree implementation, the checks will trigger more often, and this is expected. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index abc3b1db81f8..be7644204cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2804,9 +2804,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. -- cgit v1.2.3 From f33198163a0fbb03766444253edf6ea50685d725 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:26 -0700 Subject: tcp: pass previous skb to tcp_shifted_skb() No need to recompute previous skb, as it will be a bit more expensive when rtx queue is converted to RB tree. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be7644204cd4..72c4732ae2da 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1288,13 +1288,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1495,7 +1495,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very @@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: -- cgit v1.2.3 From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:27 -0700 Subject: tcp: implement rb-tree based retransmit queue Using a linear list to store all skbs in write queue has been okay for quite a while : O(N) is not too bad when N < 500. Things get messy when N is the order of 100,000 : Modern TCP stacks want 10Gbit+ of throughput even with 200 ms RTT flows. 40 ns per cache line miss means a full scan can use 4 ms, blowing away CPU caches. SACK processing often can use various hints to avoid parsing whole retransmit queue. But with high packet losses and/or high reordering, hints no longer work. Sender has to process thousands of unfriendly SACK, accumulating a huge socket backlog, burning a cpu and massively dropping packets. Using an rb-tree for retransmit queue has been avoided for years because it added complexity and overhead, but now is the time to be more resistant and say no to quadratic behavior. 1) RTX queue is no longer part of the write queue : already sent skbs are stored in one rb-tree. 2) Since reaching the head of write queue no longer needs sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue Tested: On receiver : netem on ingress : delay 150ms 200us loss 1 GRO disabled to force stress and SACK storms. for f in `seq 1 10` do ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1 done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}' Before patch : 323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88 2840 After patch: 1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94 18053 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 41 +++++++++++---- net/ipv4/tcp_input.c | 133 +++++++++++++++++++++++++----------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 137 +++++++++++++++++++++++++++----------------------- net/ipv4/tcp_timer.c | 24 +++++---- 5 files changed, 193 insertions(+), 144 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8d379c80936..3b34850d361f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); @@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1275,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1295,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; @@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk) tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); } + tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); @@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 72c4732ae2da..d0682ce2a5d6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1142,6 +1142,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1501,12 +1502,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1539,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1607,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + int unack_bytes; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + state->fack_count = 0; + unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; + if (state->mss_now && unack_bytes > 0) + state->fack_count = unack_bytes / state->mss_now; + + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1745,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); + state->mss_now = tcp_current_mss(sk); state->fack_count = 0; + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1970,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); @@ -1979,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk) } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2215,13 +2231,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) return; cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2245,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. */ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2329,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; @@ -2370,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2617,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2713,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -3077,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3089,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; @@ -3107,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3160,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3257,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3382,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3567,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3682,8 +3693,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4726,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -5530,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5565,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7460fd90884..5418ecf03b78 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8162e2880178..696b0a168f16 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; @@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2350,7 +2368,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) @@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_combine(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head = NULL, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); + skb = tp->retransmit_skb_hint; + if (!skb) { + rtx_head = tcp_rtx_queue_head(sk); + skb = rtx_head; } - max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { @@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); - tcp_unlink_write_queue(skb, sk); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..7014cc00c74c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); -- cgit v1.2.3 From 180ca444b985c42948fa26abd278e616b5ce7eb2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5308d7cbd75..0ba4fbb2f855 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -110,6 +110,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + write_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + write_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. * -- cgit v1.2.3 From 35732d01fe311ec13c4e42936878b782b8e7ea85 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is a preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..dc5e70975966 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -392,6 +396,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -404,6 +409,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -1091,6 +1101,337 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + if (!bucket || !rt6_ex) + return; + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; +} + +/* Remove oldest rt6_ex in bucket and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() to recreate the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From 60006a4825f9e71adf770745e17790c6e6c97a89 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:58 -0700 Subject: ipv6: prepare fib6_remove_prefsrc() for exception table After we move cached dst entries into the exception table under its parent route, current fib6_remove_prefsrc() no longer can access them. This commit makes fib6_remove_prefsrc() also go through all routes in the exception table to remove the pref src. This is a preparation patch in order to move all cached dst into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc5e70975966..f52ac57dcc99 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1264,6 +1264,12 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (ort->rt6i_src.plen) src_key = &nrt->rt6i_src.addr; #endif + + /* Update rt6i_prefsrc as it could be changed + * in rt6_remove_prefsrc() + */ + nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) @@ -1432,6 +1438,25 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } +static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + rt6_ex->rt6i->rt6i_prefsrc.plen = 0; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3159,8 +3184,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) if (((void *)rt->dst.dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->rt6i_prefsrc.plen = 0; + /* need to update cache as well */ + rt6_exceptions_remove_prefsrc(rt); + spin_unlock_bh(&rt6_exception_lock); } return 0; } -- cgit v1.2.3 From f5bbe7ee79c2842ca69f25afd0b6b65a9011b735 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:59 -0700 Subject: ipv6: prepare rt6_mtu_change() for exception table If we move all cached dst into the exception table under the main route, current rt6_mtu_change() will no longer be able to access them. This commit makes rt6_mtu_change_route() function to also go through all cached routes in the exception table under the main route and do proper updates on the mtu. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f52ac57dcc99..d9805a857809 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1269,6 +1269,14 @@ static int rt6_insert_exception(struct rt6_info *nrt, * in rt6_remove_prefsrc() */ nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + /* rt6_mtu_change() might lower mtu on ort. + * Only insert this exception route if its mtu + * is less than ort's mtu value. + */ + if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + err = -EINVAL; + goto out; + } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); @@ -1457,6 +1465,32 @@ static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) } } +static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) + entry->rt6i_pmtu = mtu; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3296,6 +3330,10 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { + spin_lock_bh(&rt6_exception_lock); + /* This case will be removed once the exception table + * is hooked up. + */ if (rt->rt6i_flags & RTF_CACHE) { /* For RTF_CACHE with rt6i_pmtu == 0 * (i.e. a redirected route), @@ -3309,6 +3347,8 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } + rt6_exceptions_update_pmtu(rt, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); } return 0; } -- cgit v1.2.3 From b16cb459d77800dcb886b5e73e1beafd3d596897 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:00 -0700 Subject: ipv6: prepare rt6_clean_tohost() for exception table If we move all cached dst into the exception table under the main route, current rt6_clean_tohost() will no longer be able to access them. This commit makes fib6_clean_tohost() to also go through all cached routes in exception table and removes cached gateway routes to the passed in gateway. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d9805a857809..e8e901589564 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1491,6 +1491,43 @@ static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) } } +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +static void rt6_exceptions_clean_tohost(struct rt6_info *rt, + struct in6_addr *gateway) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == + RTF_CACHE_GATEWAY && + ipv6_addr_equal(gateway, + &entry->rt6i_gateway)) { + rt6_remove_exception(bucket, rt6_ex); + } + } + bucket++; + } + } + + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3240,18 +3277,27 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) -#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + /* RTF_CACHE_GATEWAY case will be removed once the exception + * table is hooked up to store all cached routes. + */ if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } + + /* Further clean up cached routes in exception table. + * This is needed because cached route may have a different + * gateway than its 'parent' in the case of an ip redirect. + */ + rt6_exceptions_clean_tohost(rt, gateway); + return 0; } -- cgit v1.2.3 From c757faa8bfa26a0dd24b41ff783e0da042156887 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collecting. Introduce a new function rt6_age_exception() which goes through all dst entries in the exception table and remove those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 26 ++++++++--------------- net/ipv6/route.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0ba4fbb2f855..3afbe50f2779 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include #include -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -1890,12 +1882,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1904,9 +1890,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1915,6 +1898,9 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; + /* The following part will soon be removed when the exception + * table is hooked up to store all cached routes. + */ } else if (rt->rt6i_flags & RTF_CACHE) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1940,6 +1926,12 @@ static int fib6_age(struct rt6_info *rt, void *arg) gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. + */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e8e901589564..d2dd55f58b5d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1528,6 +1528,66 @@ static void rt6_exceptions_clean_tohost(struct rt6_info *rt, spin_unlock_bh(&rt6_exception_lock); } +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From 38fbeeeeccdb38d0635398e8e344d245f6d8dc52 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed in prefix address key. It currently tries to find the fib6_node with the exact match of the passed in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify if it needs exact match or longest prefix match. Right now, all callers still does exact match when calling fib6_locate(). It will be changed in later commit where exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 30 +++++++++++++++++++++++------- net/ipv6/route.c | 5 +++-- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 837418ff2d4b..3ccaf52824c9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2322,7 +2322,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3afbe50f2779..b3e4cf0962f8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1343,14 +1343,21 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); @@ -1360,11 +1367,13 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + /* * We have more bits to go */ @@ -1373,24 +1382,31 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, else fn = fn->left; } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + exact_match); } #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2dd55f58b5d..855b4ceec349 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2800,7 +2800,8 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + true); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { @@ -3009,7 +3010,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; -- cgit v1.2.3 From 2b760fcf5cfb34e8610df56d83745b2b74ae1379 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt_info and no longer inserts these routes into fib6 tree. This makes the fib6 tree only contain static configured routes and could now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search for the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also need to flush the whole hash table under it. If it is a dst cache, then only delete the cached dst in the hash table. Note: for fib6_walk_continue() function, w->root now is always pointing to a root node considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() msg to make sure w->root always points to a root node and also removed the update of w->root in fib6_repair_tree(). This is a prerequisite for later patch because we don't need to make w->root as rcu protected when replacing rwlock with RCU. Also, we remove all prune related variables as it is no longer used. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 1 - net/ipv6/ip6_fib.c | 95 +++++++++------------------------------------ net/ipv6/route.c | 108 ++++++++++++++++++++++++++-------------------------- 3 files changed, 72 insertions(+), 132 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3ccaf52824c9..873afafddfc4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2326,7 +2326,6 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!fn) goto out; - noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b3e4cf0962f8..9c8e704e6af7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,7 +54,6 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); @@ -1101,6 +1100,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1192,11 +1193,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) { + if (!err) fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); - } out: if (err) { @@ -1511,19 +1509,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1557,12 +1548,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) fn->rr_ptr = NULL; @@ -1625,18 +1621,9 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself @@ -1679,16 +1666,14 @@ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); + for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1820,20 +1805,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1858,7 +1839,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); + func, sernum, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1871,22 +1852,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1914,32 +1879,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - /* The following part will soon be removed when the exception - * table is hooked up to store all cached routes. - */ - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } /* Also age clones in the exception table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 855b4ceec349..65130dde276a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -878,8 +878,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -893,6 +893,11 @@ restart: if (fn) goto restart; } + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); @@ -1592,7 +1597,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1624,6 +1629,10 @@ redo_rt6_select: } } + /*Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { dst_use(&rt->dst, jiffies); @@ -1988,23 +1997,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. - */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -2068,7 +2071,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2093,8 +2096,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -2785,9 +2803,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2801,13 +2819,17 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, - true); + !(cfg->fc_flags & RTF_CACHE)); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2933,8 +2955,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * takes care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2942,17 +2970,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -3344,12 +3361,8 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - /* RTF_CACHE_GATEWAY case will be removed once the exception - * table is hooked up to store all cached routes. - */ - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } @@ -3438,20 +3451,9 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - /* This case will be removed once the exception table - * is hooked up. - */ - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. - */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } rt6_exceptions_update_pmtu(rt, arg->mtu); -- cgit v1.2.3 From a94b9367e044ba672c9f4105eb1516ff6ff4948a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:04 -0700 Subject: ipv6: grab rt->rt6i_ref before allocating pcpu rt After rwlock is replaced with rcu and spinlock, ip6_pol_route() will be called with only rcu held. That means rt6 route deletion could happen simultaneously with rt6_make_pcpu_rt(). This could potentially cause memory leak if rt6_release() is called right before rt6_make_pcpu_rt() on the same route. This patch grabs rt->rt6i_ref safely before calling rt6_make_pcpu_rt() to make sure rt6_release() will not get triggered while rt6_make_pcpu_rt() is in progress. And rt6_release() is called after rt6_make_pcpu_rt() is finished. Note: As we are incrementing rt->rt6i_ref in ip6_pol_route(), there is a very slim chance that fib6_purge_rt() will be triggered unnecessarily when deleting a route if ip6_pol_route() running on another thread picks this route as well and tries to make pcpu cache for it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 65130dde276a..941c062389d2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1070,7 +1070,6 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); @@ -1081,28 +1080,20 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - read_lock_bh(&table->tb6_lock); - if (rt->rt6i_pcpu) { - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = prev; - } - } else { - /* rt has been removed from the fib6 tree - * before we have a chance to acquire the read_lock. - * In this case, don't brother to create a pcpu rt - * since rt is going away anyway. The next - * dst_check() will trigger a re-lookup. - */ + dst_hold(&pcpu_rt->dst); + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + /* release refcnt taken by ip6_rt_pcpu_alloc() */ + dst_release_immediate(&pcpu_rt->dst); + /* release refcnt taken by above dst_hold() */ dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = rt; + dst_hold(&prev->dst); + pcpu_rt = prev; } - dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); return pcpu_rt; } @@ -1683,19 +1674,28 @@ redo_rt6_select: if (pcpu_rt) { read_unlock_bh(&table->tb6_lock); } else { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - */ - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - pcpu_rt = rt6_make_pcpu_route(rt); - dst_release(&rt->dst); + /* atomic_inc_not_zero() is needed when using rcu */ + if (atomic_inc_not_zero(&rt->rt6i_ref)) { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. + * + * No dst_hold() on rt is needed because grabbing + * rt->rt6i_ref makes sure rt can't be released. + */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = rt6_make_pcpu_route(rt); + rt6_release(rt); + } else { + /* rt is already removed from tree */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = net->ipv6.ip6_null_entry; + dst_hold(&pcpu_rt->dst); + } } trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; - } } EXPORT_SYMBOL_GPL(ip6_pol_route); -- cgit v1.2.3 From 51e398e86d61b69b9a4be49ff7f6afeb87530df1 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:05 -0700 Subject: ipv6: don't release rt->rt6i_pcpu memory during rt6_release() After rwlock is replaced with rcu and spinlock, route lookup can happen simultanously with route deletion. This patch removes the call to free_percpu(rt->rt6i_pcpu) from rt6_release() to avoid the race condition between rt6_release() and rt6_get_pcpu_route(). And as free_percpu(rt->rt6i_pcpu) is already called in ip6_dst_destroy() after the rcu grace period, it is safe to do this change. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9c8e704e6af7..eee392f7b1f6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -190,9 +190,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } - - free_percpu(non_pcpu_rt->rt6i_pcpu); - non_pcpu_rt->rt6i_pcpu = NULL; } EXPORT_SYMBOL_GPL(rt6_free_pcpu); -- cgit v1.2.3 From d3843fe5fd45be0e04a251a2cc68893c859a31bd Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:06 -0700 Subject: ipv6: replace dst_hold() with dst_hold_safe() in routing code With rwlock, it is safe to call dst_hold() in the read thread because read thread is guaranteed to be separated from write thread. However, after we replace rwlock with rcu, it is no longer safe to use dst_hold(). A dst might already have been deleted but is waiting for the rcu grace period to pass before freeing the memory when a read thread is trying to do dst_hold(). This could potentially cause double free issue. So this commit replaces all dst_hold() with dst_hold_safe() in all read thread to avoid this double free issue. And in order to make the code more compact, a new function ip6_hold_safe() is introduced. It calls dst_hold_safe() first, and if that fails, it will either fall back to hold and return net->ipv6.ip6_null_entry or set rt to NULL according to the caller's need. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- net/ipv6/route.c | 71 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 873afafddfc4..f86e931d555e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2333,7 +2333,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + rt = NULL; break; } out: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 941c062389d2..aeb349aea429 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -874,6 +874,23 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, + bool null_fallback) +{ + struct rt6_info *rt = *prt; + + if (dst_hold_safe(&rt->dst)) + return true; + if (null_fallback) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = NULL; + } + *prt = rt; + return false; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) @@ -898,7 +915,9 @@ restart: if (rt_cache) rt = rt_cache; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -1061,10 +1080,9 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) { - dst_hold(&pcpu_rt->dst); + if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) rt6_dst_from_metrics_check(pcpu_rt); - } + return pcpu_rt; } @@ -1625,12 +1643,17 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - dst_use(&rt->dst, jiffies); + if (rt == net->ipv6.ip6_null_entry) { + read_unlock_bh(&table->tb6_lock); + dst_hold(&rt->dst); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + rt6_dst_from_metrics_check(rt); + } read_unlock_bh(&table->tb6_lock); - - rt6_dst_from_metrics_check(rt); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1643,7 +1666,13 @@ redo_rt6_select: struct rt6_info *uncached_rt; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + } else { + read_unlock_bh(&table->tb6_lock); + uncached_rt = rt; + goto uncached_rt_out; + } read_unlock_bh(&table->tb6_lock); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); @@ -1659,6 +1688,7 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } +uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1667,8 +1697,7 @@ redo_rt6_select: struct rt6_info *pcpu_rt; - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_use_noref(&rt->dst, jiffies); pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { @@ -2130,7 +2159,7 @@ restart: } out: - dst_hold(&rt->dst); + ip6_hold_safe(net, &rt, true); read_unlock_bh(&table->tb6_lock); @@ -2841,7 +2870,8 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + break; read_unlock_bh(&table->tb6_lock); /* if gateway was specified only delete the one hop */ @@ -3038,7 +3068,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); break; } out: @@ -3096,7 +3126,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev break; } if (rt) - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); read_unlock_bh(&table->tb6_lock); return rt; } @@ -3139,9 +3169,12 @@ restart: for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) { + read_unlock_bh(&table->tb6_lock); + ip6_del_rt(rt); + } else { + read_unlock_bh(&table->tb6_lock); + } goto restart; } } -- cgit v1.2.3 From bbd63f06d114a52be33f6982fc89ca2768cdeb62 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then call fib6_add_rt2node() to insert the route to the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This does not have issue in the current code because reader thread will not be able to access the tree while writer thread is inserting new route to it. However, it is not the case once we transition to use RCU. Reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the update of fn_sernum in fib6_add_1(), and instead, introduce a new function that updates fn_sernum for all related nodes and call this functions once the route is successfully inserted to the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the updated of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eee392f7b1f6..f604b311cc3e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -585,7 +585,7 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum, + int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; @@ -631,8 +631,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn->leaf = NULL; } - fn->fn_sernum = sernum; - return fn; } @@ -641,7 +639,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? fn->right : fn->left; @@ -677,7 +674,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, ln->fn_bit = plen; ln->parent = pn; - ln->fn_sernum = sernum; if (dir) pn->right = ln; @@ -737,8 +733,6 @@ insert_above: in->leaf = fn->leaf; atomic_inc(&in->leaf->rt6i_ref); - in->fn_sernum = sernum; - /* update parent pointer */ if (dir) pn->right = in; @@ -750,8 +744,6 @@ insert_above: ln->parent = in; fn->parent = in; - ln->fn_sernum = sernum; - if (addr_bit_set(addr, bit)) { in->right = ln; in->left = fn; @@ -776,8 +768,6 @@ insert_above: ln->parent = pn; - ln->fn_sernum = sernum; - if (dir) pn->right = ln; else @@ -1079,6 +1069,20 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = fn->parent; + } +} + /* * Add routing information to the routing tree. * / @@ -1111,7 +1115,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1145,15 +1149,13 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1172,8 +1174,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1190,8 +1191,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) + if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); + } out: if (err) { -- cgit v1.2.3 From 8d1040e808bb2a5aeb4f0791b32bc7356a01ab11 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:08 -0700 Subject: ipv6: check fn->leaf before it is used If rwlock is replaced with rcu and spinlock, it is possible that the reader thread will see fn->leaf as NULL in the following scenarios: 1. fib6_add() is in progress and we have already inserted a new node but not yet inserted the route. 2. fib6_del_route() is in progress and we have already set fn->leaf to NULL but not yet freed the node because of rcu grace period. This patch makes sure all the reader threads check fn->leaf first before using it. And together with later patch to grab rcu_read_lock() and rcu_dereference() fn->leaf, it makes sure reader threads are safe when accessing fn->leaf. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 23 ++++++++++++++++++----- net/ipv6/route.c | 20 ++++++++++++-------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f604b311cc3e..cf6137e81408 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1279,10 +1279,13 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = fn->leaf; struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES @@ -1299,9 +1302,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, return fn; } } -#ifdef CONFIG_IPV6_SUBTREES backtrack: -#endif if (fn->fn_flags & RTN_ROOT) break; @@ -1358,7 +1359,18 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = fn->leaf; + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -1372,6 +1384,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, prev = fn; +next: /* * We have more bits to go */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aeb349aea429..05dc450af441 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -712,6 +712,7 @@ out: } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *leaf, struct rt6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) @@ -730,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -748,17 +749,21 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, + int oif, int strict) { + struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; - struct net *net; bool do_rr = false; + if (!leaf) + return net->ipv6.ip6_null_entry; + rt0 = fn->rr_ptr; if (!rt0) - fn->rr_ptr = rt0 = fn->leaf; + fn->rr_ptr = rt0 = leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); if (do_rr) { @@ -766,13 +771,12 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) - next = fn->leaf; + next = leaf; if (next != rt0) fn->rr_ptr = next; } - net = dev_net(rt0->dst.dev); return match ? match : net->ipv6.ip6_null_entry; } @@ -1623,7 +1627,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(fn, oif, strict); + rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(rt, fl6, oif, strict); if (rt == net->ipv6.ip6_null_entry) { -- cgit v1.2.3 From 17ecf590b3cba19d9ecb410e340aa78128382abb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:09 -0700 Subject: ipv6: add key length check into rt6_select() After rwlock is replaced with rcu and spinlock, fib6_lookup() could potentially return an intermediate node if other thread is doing fib6_del() on a route which is the only route on the node so that fib6_repair_tree() will be called on this node and potentially assigns fn->leaf to the its child's fn->leaf. In order to detect this situation in rt6_select(), we have to check if fn->fn_bit is consistent with the key length stored in the route. And depending on if the fn is in the subtree or not, the key is either rt->rt6i_dst or rt->rt6i_src. If any inconsistency is found, that means the node no longer holds valid routes in it. So net->ipv6.ip6_null_entry is returned. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 05dc450af441..24b80f43bbfb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -755,6 +755,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; bool do_rr = false; + int key_plen; if (!leaf) return net->ipv6.ip6_null_entry; @@ -763,6 +764,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!rt0) fn->rr_ptr = rt0 = leaf; + /* Double check to make sure fn is not an intermediate node + * and fn->leaf does not points to its child's leaf + * (This might happen if all routes under fn are deleted from + * the tree and fib6_repair_tree() is called on the node.) + */ + key_plen = rt0->rt6i_dst.plen; +#ifdef CONFIG_IPV6_SUBTREES + if (rt0->rt6i_src.plen) + key_plen = rt0->rt6i_src.plen; +#endif + if (fn->fn_bit != key_plen) + return net->ipv6.ip6_null_entry; + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); -- cgit v1.2.3 From 66f5d6ce53e665477d2a33e8f539d4fa4ca81c83 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preparation work before, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means now all fib6_node in fib6_table are protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous sessions protected by the write lock will now be protected by the spin lock per table. All previous sessions protected by read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well. So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select() which is a reader thread. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As it is not in the normal operation and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so are fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to do further improvement on the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer mark rt->dst.rt6_next to NULL to make simultaneous reader be able to further traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be performed before we publish this route (either by linking it to fn->leaf or insert it in the list pointed by fn->leaf) just to be safe because as soon as we publish the route, some read thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 +- net/ipv6/ip6_fib.c | 405 +++++++++++++++++++++++++++++++--------------------- net/ipv6/route.c | 121 +++++++++------- 3 files changed, 316 insertions(+), 221 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f86e931d555e..9854d93e45bb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2321,12 +2321,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) @@ -2338,7 +2338,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cf6137e81408..3f95908b39c3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,8 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -107,12 +111,12 @@ void fib6_update_sernum(struct rt6_info *rt) struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } /* @@ -207,8 +211,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -227,7 +230,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -324,11 +328,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -374,7 +375,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -384,9 +385,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -423,7 +424,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -482,9 +483,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -499,9 +500,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -582,11 +583,12 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -601,7 +603,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -627,8 +631,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } return fn; @@ -641,7 +645,11 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -672,13 +680,12 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -692,7 +699,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. @@ -729,27 +737,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -766,19 +775,19 @@ insert_above: ln->fn_bit = plen; - ln->parent = pn; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); + + rcu_assign_pointer(fn->parent, ln); - fn->parent = ln; + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -824,6 +833,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -832,12 +843,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -849,9 +865,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -866,7 +884,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -928,7 +948,8 @@ next_iter: if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -942,7 +963,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -950,7 +971,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -979,10 +1001,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1008,10 +1030,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1023,14 +1045,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1038,14 +1061,15 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1079,7 +1103,8 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, smp_wmb(); while (fn) { fn->fn_sernum = sernum; - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } } @@ -1087,12 +1112,14 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, * Add routing information to the routing tree. * / * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1113,7 +1140,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { @@ -1128,7 +1156,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1146,13 +1174,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1168,10 +1197,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1182,9 +1211,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1203,19 +1232,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1230,7 +1263,7 @@ failure: * fn->leaf. */ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1268,7 +1301,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1278,8 +1312,10 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = fn->leaf; + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1289,10 +1325,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1306,12 +1341,14 @@ backtrack: if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1359,7 +1396,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1389,9 +1426,9 @@ next: * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } out: if (exact_match) @@ -1413,9 +1450,11 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, + if (fn && subtree) + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); } @@ -1433,16 +1472,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1452,31 +1501,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1484,36 +1551,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1546,17 +1613,18 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); @@ -1573,7 +1641,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_flush_exceptions(rt); /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1592,20 +1660,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1616,12 +1683,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1642,11 +1712,14 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1673,11 +1746,13 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); @@ -1697,20 +1772,22 @@ static int fib6_walk_continue(struct fib6_walker *w) w->state = FWS_L; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1732,7 +1809,9 @@ skip: case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1741,13 +1820,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1790,7 +1869,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1850,10 +1929,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1967,7 +2046,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -1978,7 +2058,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2108,7 +2189,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2173,7 +2256,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2181,9 +2264,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 24b80f43bbfb..cf44d0994b1e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -488,7 +488,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. */ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -503,7 +503,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -722,7 +722,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +731,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +744,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -752,7 +753,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -760,9 +761,9 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!leaf) return net->ipv6.ip6_null_entry; - rt0 = fn->rr_ptr; + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = leaf; + rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf @@ -781,14 +782,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) next = leaf; - if (next != rt0) - fn->rr_ptr = next; + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } return match ? match : net->ipv6.ip6_null_entry; @@ -878,13 +884,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -916,13 +923,19 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -936,7 +949,7 @@ restart: if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -990,9 +1003,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1090,7 +1103,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1632,7 +1645,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1662,7 +1675,7 @@ redo_rt6_select: rt = rt_cache; if (rt == net->ipv6.ip6_null_entry) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1671,7 +1684,7 @@ redo_rt6_select: dst_use_noref(&rt->dst, jiffies); rt6_dst_from_metrics_check(rt); } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1687,11 +1700,11 @@ redo_rt6_select: if (ip6_hold_safe(net, &rt, true)) { dst_use_noref(&rt->dst, jiffies); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = rt; goto uncached_rt_out; } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1719,7 +1732,7 @@ uncached_rt_out: pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } else { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { @@ -1730,12 +1743,12 @@ uncached_rt_out: * No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } @@ -2131,10 +2144,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2179,7 +2192,7 @@ restart: out: ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2778,9 +2791,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2806,7 +2819,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2836,7 +2849,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2861,7 +2874,7 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, @@ -2869,7 +2882,7 @@ static int ip6_route_del(struct fib6_config *cfg, !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); @@ -2890,7 +2903,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!dst_hold_safe(&rt->dst)) break; - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2899,7 +2912,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -3074,12 +3087,12 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) @@ -3090,7 +3103,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3136,8 +3149,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -3145,7 +3158,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev } if (rt) ip6_hold_safe(NULL, &rt, false); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3183,20 +3196,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { if (dst_hold_safe(&rt->dst)) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); ip6_del_rt(rt); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } goto restart; } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } -- cgit v1.2.3 From 81eb8447daae3b62247aa66bb17b82f8fef68249 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 42 ++++++++++++++++++++++++------------------ net/ipv6/route.c | 16 ++++++++++++++-- 2 files changed, 38 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3f95908b39c3..52a29ba32928 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -149,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -170,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -583,7 +587,8 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_table *table, +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, @@ -675,7 +680,7 @@ static struct fib6_node *fib6_add_1(struct fib6_table *table, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -716,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -768,7 +773,7 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -1065,6 +1070,7 @@ add: fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } @@ -1140,7 +1146,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(table, root, + fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); @@ -1170,7 +1176,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; @@ -1181,8 +1187,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1191,7 +1197,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. */ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } @@ -1200,8 +1206,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1609,7 +1615,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf44d0994b1e..399d1bceec4a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -143,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -359,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -1156,6 +1160,8 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + if (!bucket || !rt6_ex) return; rt6_ex->rt6i->rt6i_node = NULL; @@ -1164,6 +1170,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory @@ -1270,6 +1277,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { + struct net *net = dev_net(ort->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1339,6 +1347,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; if (bucket->depth > FIB6_MAX_DEPTH) rt6_exception_remove_oldest(bucket); @@ -1714,6 +1723,7 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); @@ -1894,6 +1904,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -2341,6 +2352,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -4422,7 +4434,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), -- cgit v1.2.3 From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Wed, 4 Oct 2017 19:35:57 +0200 Subject: ip_tunnel: add mpls over gre support This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new tunnel type TUNNEL_ENCAP_MPLS. Signed-off-by: Amine Kherbouche Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c5b9ce41d66f..9745e8f69810 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) #include @@ -39,6 +40,36 @@ static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL) +size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct mpls_shim_hdr); +} + +static const struct ip_tunnel_encap_ops mpls_iptun_ops = { + .encap_hlen = ipgre_mpls_encap_hlen, +}; + +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ + ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} +#else +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return 0; +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ +} +#endif + static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); @@ -2485,6 +2516,10 @@ static int __init mpls_init(void) 0); rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, 0); + err = ipgre_tunnel_encap_add_mpls_ops(); + if (err) + pr_err("Can't add mpls over gre tunnel ops\n"); + err = 0; out: return err; @@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void) dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); + ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); -- cgit v1.2.3 From 64237470ddf97b63155fbd272c9e743e01d5f514 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:37:29 +0800 Subject: net: phonet: mark header_ops as const Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/phonet/af_phonet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index f925753668a7..b12142e55d19 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) return 1; } -struct header_ops phonet_header_ops = { +const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; -- cgit v1.2.3 From 548ec114705bb8f0879a0da12abec17f17a7cc26 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:40:35 +0800 Subject: net: phonet: mark phonet_protocol as const The phonet_protocol structs don't need to be written by anyone and so can be marked as const. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/phonet/af_phonet.c | 15 ++++++++------- net/phonet/datagram.c | 2 +- net/phonet/pep.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index b12142e55d19..3b0ef691f5b1 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -35,11 +35,11 @@ #include /* Transport protocol registration */ -static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; -static struct phonet_protocol *phonet_proto_get(unsigned int protocol) +static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { - struct phonet_protocol *pp; + const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; @@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol) return pp; } -static inline void phonet_proto_put(struct phonet_protocol *pp) +static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } @@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; struct pn_sock *pn; - struct phonet_protocol *pnp; + const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) @@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = { static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, - struct phonet_protocol *pp) + const struct phonet_protocol *pp) { int err = 0; @@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol, } EXPORT_SYMBOL(phonet_proto_register); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp) +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 5e710435ffa9..b44fb9018fb8 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -195,7 +195,7 @@ static struct proto pn_proto = { .name = "PHONET", }; -static struct phonet_protocol pn_dgram_proto = { +static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e81537991ddf..9fc76b19cd3c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1351,7 +1351,7 @@ static struct proto pep_proto = { .name = "PNPIPE", }; -static struct phonet_protocol pep_pn_proto = { +static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, -- cgit v1.2.3 From f192970de860d3ab90aa9e2a22853201a57bde78 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 5 Oct 2017 12:07:12 -0700 Subject: ip_gre: check packet length and mtu correctly in erspan tx Similarly to early patch for erspan_xmit(), the ARPHDR_ETHER device is the length of the whole ether packet. So skb->len should subtract the dev->hard_header_len. Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel") Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: William Tu Cc: Xin Long Cc: David Laight Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index dc2317776499..c105a315b1a3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len - dev->hard_header_len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } -- cgit v1.2.3 From 951f788a80ff8b6339c5c1ab888b0d4b4352efd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:07:18 -0700 Subject: ipv6: fix a BUG in rt6_get_pcpu_route() Ido reported following splat and provided a patch. [ 122.221814] BUG: using smp_processor_id() in preemptible [00000000] code: sshd/2672 [ 122.221845] caller is debug_smp_processor_id+0x17/0x20 [ 122.221866] CPU: 0 PID: 2672 Comm: sshd Not tainted 4.14.0-rc3-idosch-next-custom #639 [ 122.221880] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016 [ 122.221893] Call Trace: [ 122.221919] dump_stack+0xb1/0x10c [ 122.221946] ? _atomic_dec_and_lock+0x124/0x124 [ 122.221974] ? ___ratelimit+0xfe/0x240 [ 122.222020] check_preemption_disabled+0x173/0x1b0 [ 122.222060] debug_smp_processor_id+0x17/0x20 [ 122.222083] ip6_pol_route+0x1482/0x24a0 ... I believe we can simplify this code path a bit, since we no longer hold a read_lock and need to release it to avoid a dead lock. By disabling BH, we make sure we'll prevent code re-entry and rt6_get_pcpu_route()/rt6_make_pcpu_route() run on the same cpu. Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Ido Schimmel Signed-off-by: Eric Dumazet Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/route.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 399d1bceec4a..606e80325b21 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1136,15 +1136,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) dst_hold(&pcpu_rt->dst); p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - /* release refcnt taken by ip6_rt_pcpu_alloc() */ - dst_release_immediate(&pcpu_rt->dst); - /* release refcnt taken by above dst_hold() */ - dst_release_immediate(&pcpu_rt->dst); - dst_hold(&prev->dst); - pcpu_rt = prev; - } + BUG_ON(prev); rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; @@ -1739,31 +1731,25 @@ uncached_rt_out: struct rt6_info *pcpu_rt; dst_use_noref(&rt->dst, jiffies); + local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(rt); - if (pcpu_rt) { - rcu_read_unlock(); - } else { + if (!pcpu_rt) { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - * - * No dst_hold() on rt is needed because grabbing + /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } } - + local_bh_enable(); + rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; } -- cgit v1.2.3 From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:37 -0700 Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to suppress arp and nd flood on bridge ports. It implements rfc7432, section 10. https://tools.ietf.org/html/rfc7432#section-10 for ethernet VPN deployments. It is similar to the existing BR_PROXYARP* flags but has a few semantic differences to conform to EVPN standard. Unlike the existing flags, this new flag suppresses flood of all neigh discovery packets (arp and nd) to tunnel ports. Supports both vlan filtering and non-vlan filtering bridges. In case of EVPN, it is mainly used to avoid flooding of arp and nd packets to tunnel ports like vxlan. This patch adds netlink and sysfs support to set this bridge port flag. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br_arp_nd_proxy.c | 32 ++++++++++++++++++++++++++++++++ net/bridge/br_forward.c | 2 +- net/bridge/br_if.c | 5 +++++ net/bridge/br_netlink.c | 10 +++++++++- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_if.c | 2 ++ 7 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 net/bridge/br_arp_nd_proxy.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 40b1ede527ca..4aee55fdcc92 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o \ - br_netlink_tunnel.o + br_netlink_tunnel.o br_arp_nd_proxy.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c new file mode 100644 index 000000000000..f889ad5f0048 --- /dev/null +++ b/net/bridge/br_arp_nd_proxy.c @@ -0,0 +1,32 @@ +/* + * Handle bridge arp/nd proxy/suppress + * + * Copyright (C) 2017 Cumulus Networks + * Copyright (c) 2017 Roopa Prabhu + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "br_private.h" + +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool neigh_suppress = false; + + list_for_each_entry(p, &br->port_list, list) { + if (p->flags & BR_NEIGH_SUPPRESS) { + neigh_suppress = true; + break; + } + } + + br->neigh_suppress_enabled = neigh_suppress; +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac..b4eed113d2ec 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; - if ((p->flags & BR_PROXYARP_WIFI) && + if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) && BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 59a74a414e20..ae38547bbf91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_recalculate_neigh_suppress_enabled(br); + br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); @@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_AUTO_MASK) nbp_update_port_count(br); + + if (mask & BR_NEIGH_SUPPRESS) + br_recalculate_neigh_suppress_enabled(br); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dea88a255d26..f0e82682e071 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || - nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || + nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, + !!(p->flags & BR_NEIGH_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) p->group_fwd_mask = fwd_mask; } + err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, + BR_NEIGH_SUPPRESS); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ab4df24f7bba..00fa371b1fb2 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -404,6 +404,7 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + bool neigh_suppress_enabled; }; struct br_input_skb_cb { @@ -1139,4 +1140,5 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 9110d5e56085..0a1fa9ccd8b7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, + &brport_attr_neigh_suppress, NULL }; -- cgit v1.2.3 From 057658cb33fbf4d4309f01fe8845903b1cd07fad Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:38 -0700 Subject: bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports This patch avoids flooding and proxies arp packets for BR_NEIGH_SUPPRESS ports. Moves existing br_do_proxy_arp to br_do_proxy_suppress_arp to support both proxy arp and neigh suppress. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_arp_nd_proxy.c | 188 +++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_device.c | 9 +++ net/bridge/br_input.c | 63 ++------------- net/bridge/br_private.h | 3 + 4 files changed, 205 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index f889ad5f0048..a79c1824e163 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -14,6 +14,14 @@ */ #include +#include +#include +#include +#include +#include +#include +#include + #include "br_private.h" void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) @@ -30,3 +38,183 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) br->neigh_suppress_enabled = neigh_suppress; } + +#if IS_ENABLED(CONFIG_INET) +static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, + struct net_device *dev, __be32 dest_ip, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, + __be16 vlan_proto, u16 vlan_tci) +{ + struct net_bridge_vlan_group *vg; + struct sk_buff *skb; + u16 pvid; + + netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n", + dev->name, &dest_ip, dest_hw, &src_ip, src_hw); + + if (!vlan_tci) { + arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + return; + } + + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + + if (p) { + arp_xmit(skb); + } else { + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_HOST; + + netif_rx_ni(skb); + } +} + +static int br_chk_addr_ip(struct net_device *dev, void *data) +{ + __be32 ip = *(__be32 *)data; + struct in_device *in_dev; + __be32 addr = 0; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip, + RT_SCOPE_HOST); + + if (addr == ip) + return 1; + + return 0; +} + +static bool br_is_local_ip(struct net_device *dev, __be32 ip) +{ + if (br_chk_addr_ip(dev, &ip)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip)) + return true; + + return false; +} + +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = dev; + struct neighbour *n; + struct arphdr *parp; + u8 *arpptr, *sha; + __be32 sip, tip; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) + return; + + parp = arp_hdr(skb); + + if (parp->ar_pro != htons(ETH_P_IP) || + parp->ar_hln != dev->addr_len || + parp->ar_pln != 4) + return; + + arpptr = (u8 *)parp + sizeof(struct arphdr); + sha = arpptr; + arpptr += dev->addr_len; /* sha */ + memcpy(&sip, arpptr, sizeof(sip)); + arpptr += sizeof(sip); + arpptr += dev->addr_len; /* tha */ + memcpy(&tip, arpptr, sizeof(tip)); + + if (ipv4_is_loopback(tip) || + ipv4_is_multicast(tip)) + return; + + if (br->neigh_suppress_enabled) { + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + if (ipv4_is_zeronet(sip) || sip == tip) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + } + + if (parp->ar_op != htons(ARPOP_REQUEST)) + return; + + if (vid != 0) { + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } + + if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) { + /* its our local ip, so don't proxy reply + * and don't forward to neigh suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(&arp_tbl, &tip, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if ((p && (p->flags & BR_PROXYARP)) || + (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI | + BR_NEIGH_SUPPRESS)))) { + if (!vid) + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, 0, 0); + else + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, + skb->vlan_proto, + skb_vlan_tag_get(skb)); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to arp replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + + neigh_release(n); + } +} +#endif diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 7acb77c9bd65..eb30c6a274c3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; const unsigned char *dest; + struct ethhdr *eth; u16 vid = 0; rcu_read_lock(); @@ -57,11 +58,19 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); + eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; + if (IS_ENABLED(CONFIG_INET) && + (eth->h_proto == htons(ETH_P_ARP) || + eth->h_proto == htons(ETH_P_RARP)) && + br->neigh_suppress_enabled) { + br_do_proxy_suppress_arp(skb, br, vid, NULL); + } + dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7cb613776b31..4b8d2ec2fa23 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb) br_netif_receive_skb); } -static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid, struct net_bridge_port *p) -{ - struct net_device *dev = br->dev; - struct neighbour *n; - struct arphdr *parp; - u8 *arpptr, *sha; - __be32 sip, tip; - - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - - if ((dev->flags & IFF_NOARP) || - !pskb_may_pull(skb, arp_hdr_len(dev))) - return; - - parp = arp_hdr(skb); - - if (parp->ar_pro != htons(ETH_P_IP) || - parp->ar_op != htons(ARPOP_REQUEST) || - parp->ar_hln != dev->addr_len || - parp->ar_pln != 4) - return; - - arpptr = (u8 *)parp + sizeof(struct arphdr); - sha = arpptr; - arpptr += dev->addr_len; /* sha */ - memcpy(&sip, arpptr, sizeof(sip)); - arpptr += sizeof(sip); - arpptr += dev->addr_len; /* tha */ - memcpy(&tip, arpptr, sizeof(tip)); - - if (ipv4_is_loopback(tip) || - ipv4_is_multicast(tip)) - return; - - n = neigh_lookup(&arp_tbl, &tip, dev); - if (n) { - struct net_bridge_fdb_entry *f; - - if (!(n->nud_state & NUD_VALID)) { - neigh_release(n); - return; - } - - f = br_fdb_find_rcu(br, n->ha, vid); - if (f && ((p->flags & BR_PROXYARP) || - (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, - sha, n->ha, sha); - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; - } - - neigh_release(n); - } -} - /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -171,8 +115,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb BR_INPUT_SKB_CB(skb)->brdev = br->dev; - if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid, p); + if (IS_ENABLED(CONFIG_INET) && + (skb->protocol == htons(ETH_P_ARP) || + skb->protocol == htons(ETH_P_RARP))) { + br_do_proxy_suppress_arp(skb, br, vid, p); + } switch (pkt_type) { case BR_PKT_MULTICAST: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 00fa371b1fb2..4e6b25be14d0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1140,5 +1140,8 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +/* br_arp_nd_proxy.c */ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p); #endif -- cgit v1.2.3 From ed842faeb2bd49256f00485402f3113205f91d30 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:39 -0700 Subject: bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports This patch avoids flooding and proxies ndisc packets for BR_NEIGH_SUPPRESS ports. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_arp_nd_proxy.c | 249 +++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_device.c | 11 ++ net/bridge/br_input.c | 11 ++ net/bridge/br_private.h | 3 + 4 files changed, 274 insertions(+) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index a79c1824e163..2cf7716254be 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -21,6 +21,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif #include "br_private.h" @@ -218,3 +221,249 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, } } #endif + +#if IS_ENABLED(CONFIG_IPV6) +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) +{ + struct nd_msg *m; + + m = skb_header_pointer(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr), sizeof(*msg), msg); + if (!m) + return NULL; + + if (m->icmph.icmp6_code != 0 || + (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && + m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) + return NULL; + + return m; +} + +static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, + struct sk_buff *request, struct neighbour *n, + __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns) +{ + struct net_device *dev = request->dev; + struct net_bridge_vlan_group *vg; + struct sk_buff *reply; + struct nd_msg *na; + struct ipv6hdr *pip6; + int na_olen = 8; /* opt hdr + ETH_ALEN for target */ + int ns_olen; + int i, len; + u8 *daddr; + u16 pvid; + + if (!dev) + return; + + len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + + sizeof(*na) + na_olen + dev->needed_tailroom; + + reply = alloc_skb(len, GFP_ATOMIC); + if (!reply) + return; + + reply->protocol = htons(ETH_P_IPV6); + reply->dev = dev; + skb_reserve(reply, LL_RESERVED_SPACE(dev)); + skb_push(reply, sizeof(struct ethhdr)); + skb_set_mac_header(reply, 0); + + daddr = eth_hdr(request)->h_source; + + /* Do we need option processing ? */ + ns_olen = request->len - (skb_network_offset(request) + + sizeof(struct ipv6hdr)) - sizeof(*ns); + for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { + daddr = ns->opt + i + sizeof(struct nd_opt_hdr); + break; + } + } + + /* Ethernet header */ + ether_addr_copy(eth_hdr(reply)->h_dest, daddr); + ether_addr_copy(eth_hdr(reply)->h_source, n->ha); + eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); + reply->protocol = htons(ETH_P_IPV6); + + skb_pull(reply, sizeof(struct ethhdr)); + skb_set_network_header(reply, 0); + skb_put(reply, sizeof(struct ipv6hdr)); + + /* IPv6 header */ + pip6 = ipv6_hdr(reply); + memset(pip6, 0, sizeof(struct ipv6hdr)); + pip6->version = 6; + pip6->priority = ipv6_hdr(request)->priority; + pip6->nexthdr = IPPROTO_ICMPV6; + pip6->hop_limit = 255; + pip6->daddr = ipv6_hdr(request)->saddr; + pip6->saddr = *(struct in6_addr *)n->primary_key; + + skb_pull(reply, sizeof(struct ipv6hdr)); + skb_set_transport_header(reply, 0); + + na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); + + /* Neighbor Advertisement */ + memset(na, 0, sizeof(*na) + na_olen); + na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */ + na->icmph.icmp6_override = 1; + na->icmph.icmp6_solicited = 1; + na->target = ns->target; + ether_addr_copy(&na->opt[2], n->ha); + na->opt[0] = ND_OPT_TARGET_LL_ADDR; + na->opt[1] = na_olen >> 3; + + na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, + &pip6->daddr, + sizeof(*na) + na_olen, + IPPROTO_ICMPV6, + csum_partial(na, sizeof(*na) + na_olen, 0)); + + pip6->payload_len = htons(sizeof(*na) + na_olen); + + skb_push(reply, sizeof(struct ipv6hdr)); + skb_push(reply, sizeof(struct ethhdr)); + + reply->ip_summed = CHECKSUM_UNNECESSARY; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci); + + netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n", + dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha); + + if (p) { + dev_queue_xmit(reply); + } else { + skb_reset_mac_header(reply); + __skb_pull(reply, skb_network_offset(reply)); + reply->ip_summed = CHECKSUM_UNNECESSARY; + reply->pkt_type = PACKET_HOST; + + netif_rx_ni(reply); + } +} + +static int br_chk_addr_ip6(struct net_device *dev, void *data) +{ + struct in6_addr *addr = (struct in6_addr *)data; + + if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) + return 1; + + return 0; +} + +static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) + +{ + if (br_chk_addr_ip6(dev, addr)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr)) + return true; + + return false; +} + +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = NULL; + struct in6_addr *saddr, *daddr; + struct ipv6hdr *iphdr; + struct neighbour *n; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && + !msg->icmph.icmp6_solicited) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) + return; + + iphdr = ipv6_hdr(skb); + saddr = &iphdr->saddr; + daddr = &iphdr->daddr; + + if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (vid != 0) { + /* build neigh table lookup on the vlan device */ + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } else { + vlandev = dev; + } + + if (br_is_local_ip6(vlandev, &msg->target)) { + /* its our own ip, so don't proxy reply + * and don't forward to arp suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) { + if (vid != 0) + br_nd_send(br, p, skb, n, + skb->vlan_proto, + skb_vlan_tag_get(skb), msg); + else + br_nd_send(br, p, skb, n, 0, 0, msg); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to NEIGH_SUPPRESS ports that we + * have replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + neigh_release(n); + } +} +#endif diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index eb30c6a274c3..28bb22186fa0 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -69,6 +69,17 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth->h_proto == htons(ETH_P_RARP)) && br->neigh_suppress_enabled) { br_do_proxy_suppress_arp(skb, br, vid, NULL); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, NULL, msg); } dest = eth_hdr(skb)->h_dest; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 4b8d2ec2fa23..a096d3e189da 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -119,6 +119,17 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4e6b25be14d0..fa0039f44818 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1144,4 +1144,7 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p); +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg); +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m); #endif -- cgit v1.2.3 From 8ef802aa8ec87a0fec20d28f8f51cad964265902 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:23 -0700 Subject: ipv6: prepare RCU lookups for idev->addr_list inet6_ifa_finish_destroy() already uses kfree_rcu() to free inet6_ifaddr structs. We need to use proper list additions/deletions in order to allow readers to use RCU instead of idev->lock rwlock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9854d93e45bb..d1ff0955b709 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -945,7 +945,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) break; } - list_add_tail(&ifp->if_list, p); + list_add_tail_rcu(&ifp->if_list, p); } static u32 inet6_addr_hash(const struct in6_addr *addr) @@ -1204,7 +1204,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); - list_del_init(&ifp->if_list); + list_del_rcu(&ifp->if_list); __in6_ifa_put(ifp); write_unlock_bh(&ifp->idev->lock); @@ -3562,7 +3562,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; - struct list_head del_list; int _keep_addr; bool keep_addr; int state, i; @@ -3654,7 +3653,6 @@ restart: */ keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); - INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { struct rt6_info *rt = NULL; bool keep; @@ -3663,8 +3661,6 @@ restart: keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) && !addr_is_local(&ifa->addr); - if (!keep) - list_move(&ifa->if_list, &del_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -3698,19 +3694,14 @@ restart: } write_lock_bh(&idev->lock); + if (!keep) { + list_del_rcu(&ifa->if_list); + in6_ifa_put(ifa); + } } write_unlock_bh(&idev->lock); - /* now clean up addresses to be removed */ - while (!list_empty(&del_list)) { - ifa = list_first_entry(&del_list, - struct inet6_ifaddr, if_list); - list_del(&ifa->if_list); - - in6_ifa_put(ifa); - } - /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); -- cgit v1.2.3 From d9bf82c2f61be534873a37a0631b58761b6d6506 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:24 -0700 Subject: ipv6: ipv6_count_addresses() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d1ff0955b709..2e029c8be1f2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev); static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); -static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_count_addresses(const struct inet6_dev *idev); static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); @@ -1785,15 +1785,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } -static int ipv6_count_addresses(struct inet6_dev *idev) +static int ipv6_count_addresses(const struct inet6_dev *idev) { + const struct inet6_ifaddr *ifp; int cnt = 0; - struct inet6_ifaddr *ifp; - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) + rcu_read_lock(); + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) cnt++; - read_unlock_bh(&idev->lock); + rcu_read_unlock(); return cnt; } -- cgit v1.2.3 From 47e26941f78d6ec03c5dd547425409f1f7852ff5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:25 -0700 Subject: ipv6: ipv6_chk_custom_prefix() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2e029c8be1f2..33ee84c2512b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1859,20 +1859,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; bool ret = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); if (ret) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); -- cgit v1.2.3 From 24ba333b2c1a609fc17214ceb215e19bd9d54951 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:26 -0700 Subject: ipv6: ipv6_chk_prefix() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 33ee84c2512b..ea63442209bf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1880,22 +1880,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; int onlink; onlink = 0; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { onlink = ipv6_prefix_equal(addr, &ifa->addr, ifa->prefix_len); if (onlink) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return onlink; -- cgit v1.2.3 From f59c031e9134bb16c38980196a73d7ba40979baa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:27 -0700 Subject: ipv6: __ipv6_dev_get_saddr() rcu conversion Callers hold rcu_read_lock(), so we do not need the rcu_read_lock()/rcu_read_unlock() pair. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ea63442209bf..20c3ca777529 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1558,8 +1558,7 @@ static int __ipv6_dev_get_saddr(struct net *net, { struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) { int i; /* @@ -1625,7 +1624,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } } out: - read_unlock_bh(&idev->lock); return hiscore_idx; } -- cgit v1.2.3 From cc429c8f6fa7cc1c9109bb4a1ad63a6d1f232f0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:28 -0700 Subject: ipv6: avoid cache line dirtying in ipv6_dev_get_saddr() By extending the rcu section a bit, we can avoid these very expensive in6_ifa_put()/in6_ifa_hold() calls done in __ipv6_dev_get_saddr() and ipv6_dev_get_saddr() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 20c3ca777529..cab3faad2bf1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1608,11 +1608,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } break; } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - swap(hiscore, score); hiscore_idx = 1 - hiscore_idx; @@ -1660,6 +1655,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, int dst_type; bool use_oif_addr = false; int hiscore_idx = 0; + int ret = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1735,15 +1731,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, } out: - rcu_read_unlock(); - hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + else + *saddr = hiscore->ifa->addr; - *saddr = hiscore->ifa->addr; - in6_ifa_put(hiscore->ifa); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ipv6_dev_get_saddr); -- cgit v1.2.3 From 77041420751fe6d4acf2103b245dcc2b4b7b8360 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:31 +0200 Subject: net: bridge: Notify on bridge device mrouter state changes Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used to indicate whether the bridge is or isn't mrouter. Notify when the bridge changes its state, similarly to the already existing bridged port mrouter notifications. The notification uses the switchdev_attr.u.mrouter boolean flag to indicate the current bridge mrouter status. Thus, it only indicates whether the bridge is currently used as an mrouter or not, and does not indicate the exact mrouter state of the bridge (learning, permanent, etc.). Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8dc5c8d69bcd..bd50550dd4ca 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -859,8 +859,32 @@ out: spin_unlock(&br->multicast_lock); } +static void br_mc_router_state_change(struct net_bridge *p, + bool is_mc_router) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + .flags = SWITCHDEV_F_DEFER, + .u.mrouter = is_mc_router, + }; + + switchdev_port_attr_set(p->dev, &attr); +} + static void br_multicast_local_router_expired(unsigned long data) { + struct net_bridge *br = (struct net_bridge *)data; + + spin_lock(&br->multicast_lock); + if (br->multicast_router == MDB_RTR_TYPE_DISABLED || + br->multicast_router == MDB_RTR_TYPE_PERM || + timer_pending(&br->multicast_router_timer)) + goto out; + + br_mc_router_state_change(br, false); +out: + spin_unlock(&br->multicast_lock); } static void br_multicast_querier_expired(struct net_bridge *br, @@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!timer_pending(&br->multicast_router_timer)) + br_mc_router_state_change(br, true); mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); + } return; } @@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br) spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, - br_multicast_local_router_expired, 0); + br_multicast_local_router_expired, (unsigned long)br); setup_timer(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, (unsigned long)br); setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, @@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: + br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); del_timer(&br->multicast_router_timer); - /* fall through */ + br->multicast_router = val; + err = 0; + break; case MDB_RTR_TYPE_TEMP_QUERY: + if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(br, false); br->multicast_router = val; err = 0; break; -- cgit v1.2.3 From 0912bda436388a02c72164b4b490b578e64c012e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:32 +0200 Subject: net: bridge: Export bridge multicast router state Add an access function that, given a bridge netdevice, returns whether the bridge device is currently an mrouter or not. The function uses the already existing br_multicast_is_router function to check that. This function is needed in order to allow ports that join an already existing bridge to know the current mrouter state of the bridge device. Together with the bridge device mrouter ports switchdev notifications, it is possible to have full offloading of the semantics of the bridge device mcast router state. Due to the fact that the bridge multicast router status can change in packet RX path, take the multicast_router bridge spinlock to protect the read. Signed-off-by: Yotam Gigi Reviewed-by: Nogah Frankel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bd50550dd4ca..7947e0436e18 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2216,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_enabled); +bool br_multicast_router(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + bool is_router; + + spin_lock_bh(&br->multicast_lock); + is_router = br_multicast_is_router(br); + spin_unlock_bh(&br->multicast_lock); + return is_router; +} +EXPORT_SYMBOL_GPL(br_multicast_router); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; -- cgit v1.2.3 From bfd8e5a407133e58a92a38ccf3d0ba6db81f22d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 06:01:37 -0700 Subject: ipv6: avoid zeroing per cpu data again per cpu allocations are already zeroed, no need to clear them again. Fixes: d52d3997f843f ("ipv6: Create percpu rt6_info") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: Tejun Heo Acked-by: Tejun Heo Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 606e80325b21..3d7d4e09301e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -377,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (rt->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **p; - - p = per_cpu_ptr(rt->rt6i_pcpu, cpu); - /* no one shares rt */ - *p = NULL; - } - } else { + if (!rt->rt6i_pcpu) { dst_release_immediate(&rt->dst); return NULL; } -- cgit v1.2.3 From 66c77ff3a049a1d182c9c78cc6286bb1ceee70db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 09:52:24 -0700 Subject: ipv6: addrlabel: rework ip6addrlbl_get() If we allocate skb before the lookup, we can use RCU without the need of ip6addrlbl_hold() This means that the following patch can get rid of refcounting. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index c6311d7108f6..f31489652c4a 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -546,38 +546,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; addr = nla_data(tb[IFAL_ADDRESS]); - rcu_read_lock(); - p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && !ip6addrlbl_hold(p)) - p = NULL; - lseq = net->ipv6.ip6addrlbl_table.seq; - rcu_read_unlock(); - - if (!p) { - err = -ESRCH; - goto out; - } - skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); - if (!skb) { - ip6addrlbl_put(p); + if (!skb) return -ENOBUFS; - } - err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWADDRLABEL, 0); + err = -ESRCH; - ip6addrlbl_put(p); + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + lseq = net->ipv6.ip6addrlbl_table.seq; + if (p) + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + rcu_read_unlock(); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - goto out; + } else { + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } - - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); -out: return err; } -- cgit v1.2.3 From 2809c0957dbbd56d3a92e9ede27b19943fef7236 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 09:52:25 -0700 Subject: ipv6: addrlabel: remove refcounting After previous patch ("ipv6: addrlabel: rework ip6addrlbl_get()") we can remove the refcount from struct ip6addrlbl_entry, since it is no longer elevated in p6addrlbl_get() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index f31489652c4a..2606d2fa3ac4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -18,7 +18,6 @@ #include #include #include -#include #if 0 #define ADDRLABEL(x...) printk(x) @@ -36,7 +35,6 @@ struct ip6addrlbl_entry { int addrtype; u32 label; struct hlist_node list; - refcount_t refcnt; struct rcu_head rcu; }; @@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table } }; -/* Object management */ -static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) -{ - kfree(p); -} - -static void ip6addrlbl_free_rcu(struct rcu_head *h) -{ - ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); -} - -static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) -{ - return refcount_inc_not_zero(&p->refcnt); -} - -static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) -{ - if (refcount_dec_and_test(&p->refcnt)) - call_rcu(&p->rcu, ip6addrlbl_free_rcu); -} - /* Find label */ static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, @@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - refcount_set(&newp->refcnt, 1); return newp; } @@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, goto out; } hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); goto out; } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || (p->prefixlen < newp->prefixlen)) { @@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net, ret = __ip6addrlbl_add(net, newp, replace); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) - ip6addrlbl_free(newp); + kfree(newp); return ret; } @@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net, p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); ret = 0; break; } @@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) spin_lock(&net->ipv6.ip6addrlbl_table.lock); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); } spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } -- cgit v1.2.3 From d0e60206bea2dec46c0a28fd6b116646aa67c5ae Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 9 Oct 2017 17:17:26 -0700 Subject: ipv6: use rcu_dereference_bh() in ipv6_route_seq_next() This patch replaces rcu_deference() with rcu_dereference_bh() in ipv6_route_seq_next() to avoid the following warning: [ 19.431685] WARNING: suspicious RCU usage [ 19.433451] 4.14.0-rc3-00914-g66f5d6c #118 Not tainted [ 19.435509] ----------------------------- [ 19.437267] net/ipv6/ip6_fib.c:2259 suspicious rcu_dereference_check() usage! [ 19.440790] [ 19.440790] other info that might help us debug this: [ 19.440790] [ 19.444734] [ 19.444734] rcu_scheduler_active = 2, debug_locks = 1 [ 19.447757] 2 locks held by odhcpd/3720: [ 19.449480] #0: (&p->lock){+.+.}, at: [] seq_read+0x3c/0x333 [ 19.452720] #1: (rcu_read_lock_bh){....}, at: [] ipv6_route_seq_start+0x5/0xfd [ 19.456323] [ 19.456323] stack backtrace: [ 19.458812] CPU: 0 PID: 3720 Comm: odhcpd Not tainted 4.14.0-rc3-00914-g66f5d6c #118 [ 19.462042] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 19.465414] Call Trace: [ 19.466788] dump_stack+0x86/0xc0 [ 19.468358] lockdep_rcu_suspicious+0xea/0xf3 [ 19.470183] ipv6_route_seq_next+0x71/0x164 [ 19.471963] seq_read+0x244/0x333 [ 19.473522] proc_reg_read+0x48/0x67 [ 19.475152] ? proc_reg_write+0x67/0x67 [ 19.476862] __vfs_read+0x26/0x10b [ 19.478463] ? __might_fault+0x37/0x84 [ 19.480148] vfs_read+0xba/0x146 [ 19.481690] SyS_read+0x51/0x8e [ 19.483197] do_int80_syscall_32+0x66/0x15a [ 19.484969] entry_INT80_compat+0x32/0x50 [ 19.486707] RIP: 0023:0xf7f0be8e [ 19.488244] RSP: 002b:00000000ffa75d04 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 19.491431] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 0000000008056068 [ 19.493886] RDX: 0000000000001000 RSI: 0000000008056008 RDI: 0000000000001000 [ 19.496331] RBP: 00000000000001ff R08: 0000000000000000 R09: 0000000000000000 [ 19.498768] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 19.501217] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Xiaolong Ye Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 52a29ba32928..c2ecd5ec638a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2262,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; -- cgit v1.2.3 From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 4 Oct 2017 17:03:12 -0700 Subject: openvswitch: Add erspan tunnel support. Add erspan netlink interface for OVS. Signed-off-by: William Tu Cc: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 51 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..fc0ca9a89b8e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -319,7 +320,8 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } size_t ovs_key_attr_size(void) @@ -371,6 +373,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -593,6 +596,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + struct erspan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + opts.index = nla_get_be32(attr); + + /* Index has only 20-bit */ + if (ntohl(opts.index) & ~INDEX_MASK) { + OVS_NLERR(log, "ERSPAN index number %x too large.", + ntohl(opts.index)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +730,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +867,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + ((struct erspan_metadata *)tun_opts)->index)) + return -EMSGSIZE; } return 0; @@ -2195,6 +2242,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; -- cgit v1.2.3 From b2427e671766c51379bc882fdf809d905c6f9619 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2017 18:01:16 +0100 Subject: ipv6: fix dereference of rt6_ex before null check error Currently rt6_ex is being dereferenced before it is null checked hence there is a possible null dereference bug. Fix this by only dereferencing rt6_ex after it has been null checked. Detected by CoverityScan, CID#1457749 ("Dereference before null check") Fixes: 81eb8447daae ("ipv6: take care of rt6_stats") Signed-off-by: Colin Ian King Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 13b0d6d1cc30..c1c44f484286 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1142,10 +1142,12 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + struct net *net; if (!bucket || !rt6_ex) return; + + net = dev_net(rt6_ex->rt6i->dst.dev); rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); rt6_release(rt6_ex->rt6i); -- cgit v1.2.3 From 442d713baa33db0f78adadee6125c215f10f5a75 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2017 19:10:30 +0100 Subject: ipv6: fix incorrect bitwise operator used on rt6i_flags The use of the | operator always leads to true which looks rather suspect to me. Fix this by using & instead to just check the RTF_CACHE entry bit. Detected by CoverityScan, CID#1457734, #1457747 ("Wrong operator used") Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache") Signed-off-by: Colin Ian King Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c1c44f484286..2e8842fa6450 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1415,7 +1415,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt) int err; if (!from || - !(rt->rt6i_flags | RTF_CACHE)) + !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (!rcu_access_pointer(from->rt6i_exception_bucket)) @@ -1459,7 +1459,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) struct rt6_exception *rt6_ex; if (!from || - !(rt->rt6i_flags | RTF_CACHE)) + !(rt->rt6i_flags & RTF_CACHE)) return; rcu_read_lock(); -- cgit v1.2.3 From d66f2b91f95b56e31772b9faa0d036cd2e53cb02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:14 -0700 Subject: bpf: don't rely on the verifier lock for metadata_dst allocation bpf_skb_set_tunnel_*() functions require allocation of per-cpu metadata_dst. The allocation happens upon verification of the first program using those helpers. In preparation for removing the verifier lock, use cmpxchg() to make sure we only allocate the metadata_dsts once. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/dst.c | 16 ++++++++++++++++ net/core/filter.c | 16 +++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index a6c47da7d0f8..8b2eafac984d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ + int cpu; + +#ifdef CONFIG_DST_CACHE + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index b7e8caa1e790..140fa9f9c0f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -2987,14 +2988,15 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* Race is not possible, since it's called from verifier - * that is holding verifier mutex. - */ - md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, - METADATA_IP_TUNNEL, - GFP_KERNEL); - if (!md_dst) + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); } switch (which) { -- cgit v1.2.3 From 9f77fad3c2e3b923984fc617ea45b60264115fc3 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Mon, 9 Oct 2017 11:37:59 -0400 Subject: net/core: Fix BUG to BUG_ON conditionals. Fix BUG() calls to use BUG_ON(conditional) macros. This was found using make coccicheck M=net/core on linux next tag next-2017092 Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- net/core/skbuff.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 822a90e56aea..40717501cbdd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1350,8 +1350,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) /* Set the tail pointer and length */ skb_put(n, skb->len); - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); copy_skb_header(n, skb); return n; @@ -1449,8 +1448,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, BUG_ON(nhead < 0); - if (skb_shared(skb)) - BUG(); + BUG_ON(skb_shared(skb)); size = SKB_DATA_ALIGN(size); @@ -1595,9 +1593,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)); copy_skb_header(n, skb); @@ -1878,8 +1875,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) return NULL; } - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) - BUG(); + BUG_ON(skb_copy_bits(skb, skb_headlen(skb), + skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. -- cgit v1.2.3 From b88d12e4a4ff5b99dcaf9b35a30ad2597cb2fd8a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 10 Oct 2017 17:10:04 +0200 Subject: rtnetlink: bridge: use ext_ack instead of printk We can now piggyback error strings to userspace via extended acks rather than using printk. Before: bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095 RTNETLINK answers: Invalid argument After: bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095 Error: invalid vlan id. v3: drop 'RTM_' prefixes, suggested by David Ahern, they are not useful, the add/del in bridge command line is enough. Also reword error in response to malformed/bad vlan id attribute size. Cc: David Ahern Signed-off-by: Florian Westphal Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e84d108cfee4..6a09f3d575af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3066,21 +3066,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); -static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, + struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", - vid); + NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } @@ -3105,24 +3105,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3209,24 +3209,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3666,7 +3666,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } @@ -3741,7 +3741,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } -- cgit v1.2.3 From 833e0e2f24fd0525090878f71e129a8a4cb8bf78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 10 Oct 2017 15:05:39 -0700 Subject: net: dst: move cpu inside ifdef to avoid compilation warning If CONFIG_DST_CACHE is not selected cpu variable will be unused and we will see a compilation warning. Move it under the ifdef. Reported-by: kbuild test robot Fixes: d66f2b91f95b ("bpf: don't rely on the verifier lock for metadata_dst allocation") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/dst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 8b2eafac984d..662a2d4a3d19 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -325,9 +325,9 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { +#ifdef CONFIG_DST_CACHE int cpu; -#ifdef CONFIG_DST_CACHE for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); -- cgit v1.2.3 From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 10 Oct 2017 16:54:44 -0400 Subject: openvswitch: add ct_clear action This adds a ct_clear action for clearing conntrack state. ct_clear is currently implemented in OVS userspace, but is not backed by an action in the kernel datapath. This is useful for flows that may modify a packet tuple after a ct lookup has already occurred. Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 4 ++++ net/openvswitch/conntrack.c | 11 +++++++++++ net/openvswitch/conntrack.h | 7 +++++++ net/openvswitch/flow_netlink.c | 5 +++++ 4 files changed, 27 insertions(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a54a556fcdb5..a551232daf61 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err == -EINPROGRESS ? 0 : err; break; + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + case OVS_ACTION_ATTR_PUSH_ETH: err = push_eth(skb, key, nla_data(a)); break; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d558e882ca0c..fe861e2f0deb 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + if (skb_nfct(skb)) { + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key); + } + + return 0; +} + static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, const struct sw_flow_key *key, bool log) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..399dfdd2c4f9 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, return -ENOTSUPP; } +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fc0ca9a89b8e..dc0d79092e74 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -76,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) break; case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: @@ -2528,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, @@ -2669,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_CT_CLEAR: + break; + case OVS_ACTION_ATTR_PUSH_ETH: /* Disallow pushing an Ethernet header if one * is already present */ -- cgit v1.2.3 From 4133da73067af0417c623eb4ad5e85081ccbf4b4 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 10 Oct 2017 22:31:49 -0400 Subject: mac80211: aead api to reduce redundancy Currently, the aes_ccm.c and aes_gcm.c are almost line by line copy of each other. This patch reduce code redundancy by moving the code in these two files to crypto/aead_api.c to make it a higher level aead api. The file aes_ccm.c and aes_gcm.c are removed and all the functions there are now implemented in their headers using the newly added aead api. Signed-off-by: Xiang Gao Signed-off-by: Johannes Berg --- net/mac80211/Makefile | 3 +- net/mac80211/aead_api.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/aead_api.h | 27 ++++++++++++ net/mac80211/aes_ccm.c | 115 ------------------------------------------------ net/mac80211/aes_ccm.h | 42 +++++++++++++----- net/mac80211/aes_gcm.c | 109 --------------------------------------------- net/mac80211/aes_gcm.h | 38 ++++++++++++---- net/mac80211/wpa.c | 4 +- 8 files changed, 206 insertions(+), 247 deletions(-) create mode 100644 net/mac80211/aead_api.c create mode 100644 net/mac80211/aead_api.h delete mode 100644 net/mac80211/aes_ccm.c delete mode 100644 net/mac80211/aes_gcm.c (limited to 'net') diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 282912245938..80f25ff2f24b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -6,6 +6,7 @@ mac80211-y := \ driver-ops.o \ sta_info.o \ wep.o \ + aead_api.o \ wpa.o \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ @@ -15,8 +16,6 @@ mac80211-y := \ rate.o \ michael.o \ tkip.o \ - aes_ccm.o \ - aes_gcm.o \ aes_cmac.o \ aes_gmac.o \ fils_aead.o \ diff --git a/net/mac80211/aead_api.c b/net/mac80211/aead_api.c new file mode 100644 index 000000000000..347f13953b2c --- /dev/null +++ b/net/mac80211/aead_api.c @@ -0,0 +1,115 @@ +/* + * Copyright 2003-2004, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2014-2015, Qualcomm Atheros, Inc. + * + * Rewrite: Copyright (C) 2013 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include "aead_api.h" + +int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, + u8 *data, size_t data_len, u8 *mic) +{ + size_t mic_len = tfm->authsize; + struct scatterlist sg[3]; + struct aead_request *aead_req; + int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); + u8 *__aad; + + aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); + if (!aead_req) + return -ENOMEM; + + __aad = (u8 *)aead_req + reqsize; + memcpy(__aad, aad, aad_len); + + sg_init_table(sg, 3); + sg_set_buf(&sg[0], __aad, aad_len); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); + + aead_request_set_tfm(aead_req, tfm); + aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); + + crypto_aead_encrypt(aead_req); + kzfree(aead_req); + + return 0; +} + +int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, + u8 *data, size_t data_len, u8 *mic) +{ + size_t mic_len = tfm->authsize; + struct scatterlist sg[3]; + struct aead_request *aead_req; + int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); + u8 *__aad; + int err; + + if (data_len == 0) + return -EINVAL; + + aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); + if (!aead_req) + return -ENOMEM; + + __aad = (u8 *)aead_req + reqsize; + memcpy(__aad, aad, aad_len); + + sg_init_table(sg, 3); + sg_set_buf(&sg[0], __aad, aad_len); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); + + aead_request_set_tfm(aead_req, tfm); + aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); + + err = crypto_aead_decrypt(aead_req); + kzfree(aead_req); + + return err; +} + +struct crypto_aead * +aead_key_setup_encrypt(const char *alg, const u8 key[], + size_t key_len, size_t mic_len) +{ + struct crypto_aead *tfm; + int err; + + tfm = crypto_alloc_aead(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return tfm; + + err = crypto_aead_setkey(tfm, key, key_len); + if (err) + goto free_aead; + err = crypto_aead_setauthsize(tfm, mic_len); + if (err) + goto free_aead; + + return tfm; + +free_aead: + crypto_free_aead(tfm); + return ERR_PTR(err); +} + +void aead_key_free(struct crypto_aead *tfm) +{ + crypto_free_aead(tfm); +} diff --git a/net/mac80211/aead_api.h b/net/mac80211/aead_api.h new file mode 100644 index 000000000000..5e39ea843bbf --- /dev/null +++ b/net/mac80211/aead_api.h @@ -0,0 +1,27 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _AEAD_API_H +#define _AEAD_API_H + +#include +#include + +struct crypto_aead * +aead_key_setup_encrypt(const char *alg, const u8 key[], + size_t key_len, size_t mic_len); + +int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, + size_t aad_len, u8 *data, + size_t data_len, u8 *mic); + +int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, + size_t aad_len, u8 *data, + size_t data_len, u8 *mic); + +void aead_key_free(struct crypto_aead *tfm); + +#endif /* _AEAD_API_H */ diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c deleted file mode 100644 index a4e0d59a40dd..000000000000 --- a/net/mac80211/aes_ccm.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2003-2004, Instant802 Networks, Inc. - * Copyright 2005-2006, Devicescape Software, Inc. - * - * Rewrite: Copyright (C) 2013 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include - -#include -#include "key.h" -#include "aes_ccm.h" - -int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - - aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, CCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, mic_len); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); - aead_request_set_ad(aead_req, sg[0].length); - - crypto_aead_encrypt(aead_req); - kzfree(aead_req); - - return 0; -} - -int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - int err; - - if (data_len == 0) - return -EINVAL; - - aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, CCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, mic_len); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0); - aead_request_set_ad(aead_req, sg[0].length); - - err = crypto_aead_decrypt(aead_req); - kzfree(aead_req); - - return err; -} - -struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[], - size_t key_len, - size_t mic_len) -{ - struct crypto_aead *tfm; - int err; - - tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return tfm; - - err = crypto_aead_setkey(tfm, key, key_len); - if (err) - goto free_aead; - err = crypto_aead_setauthsize(tfm, mic_len); - if (err) - goto free_aead; - - return tfm; - -free_aead: - crypto_free_aead(tfm); - return ERR_PTR(err); -} - -void ieee80211_aes_key_free(struct crypto_aead *tfm) -{ - crypto_free_aead(tfm); -} diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h index fcd3254c5cf0..e9b7ca0bde5b 100644 --- a/net/mac80211/aes_ccm.h +++ b/net/mac80211/aes_ccm.h @@ -10,19 +10,39 @@ #ifndef AES_CCM_H #define AES_CCM_H -#include +#include "aead_api.h" #define CCM_AAD_LEN 32 -struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[], - size_t key_len, - size_t mic_len); -int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len); -int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len); -void ieee80211_aes_key_free(struct crypto_aead *tfm); +static inline struct crypto_aead * +ieee80211_aes_key_setup_encrypt(const u8 key[], size_t key_len, size_t mic_len) +{ + return aead_key_setup_encrypt("ccm(aes)", key, key_len, mic_len); +} + +static inline int +ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, + u8 *b_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_encrypt(tfm, b_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline int +ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, + u8 *b_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_decrypt(tfm, b_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline void ieee80211_aes_key_free(struct crypto_aead *tfm) +{ + return aead_key_free(tfm); +} #endif /* AES_CCM_H */ diff --git a/net/mac80211/aes_gcm.c b/net/mac80211/aes_gcm.c deleted file mode 100644 index 8a4397cc1b08..000000000000 --- a/net/mac80211/aes_gcm.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2014-2015, Qualcomm Atheros, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include - -#include -#include "key.h" -#include "aes_gcm.h" - -int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - - aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, GCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, data_len, j_0); - aead_request_set_ad(aead_req, sg[0].length); - - crypto_aead_encrypt(aead_req); - kzfree(aead_req); - return 0; -} - -int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - int err; - - if (data_len == 0) - return -EINVAL; - - aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, GCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, - data_len + IEEE80211_GCMP_MIC_LEN, j_0); - aead_request_set_ad(aead_req, sg[0].length); - - err = crypto_aead_decrypt(aead_req); - kzfree(aead_req); - - return err; -} - -struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], - size_t key_len) -{ - struct crypto_aead *tfm; - int err; - - tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return tfm; - - err = crypto_aead_setkey(tfm, key, key_len); - if (err) - goto free_aead; - err = crypto_aead_setauthsize(tfm, IEEE80211_GCMP_MIC_LEN); - if (err) - goto free_aead; - - return tfm; - -free_aead: - crypto_free_aead(tfm); - return ERR_PTR(err); -} - -void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm) -{ - crypto_free_aead(tfm); -} diff --git a/net/mac80211/aes_gcm.h b/net/mac80211/aes_gcm.h index 55aed5352494..d2b096033009 100644 --- a/net/mac80211/aes_gcm.h +++ b/net/mac80211/aes_gcm.h @@ -9,16 +9,38 @@ #ifndef AES_GCM_H #define AES_GCM_H -#include +#include "aead_api.h" #define GCM_AAD_LEN 32 -int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic); -int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic); -struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], - size_t key_len); -void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm); +static inline int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, + u8 *j_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_encrypt(tfm, j_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, + u8 *j_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_decrypt(tfm, j_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline struct crypto_aead * +ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], size_t key_len) +{ + return aead_key_setup_encrypt("gcm(aes)", key, + key_len, IEEE80211_GCMP_MIC_LEN); +} + +static inline void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm) +{ + return aead_key_free(tfm); +} #endif /* AES_GCM_H */ diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 0d722ea98a1b..b58722d9de37 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -464,7 +464,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, pos += IEEE80211_CCMP_HDR_LEN; ccmp_special_blocks(skb, pn, b_0, aad); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, - skb_put(skb, mic_len), mic_len); + skb_put(skb, mic_len)); } @@ -543,7 +543,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, key->u.ccmp.tfm, b_0, aad, skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, data_len, - skb->data + skb->len - mic_len, mic_len)) + skb->data + skb->len - mic_len)) return RX_DROP_UNUSABLE; } -- cgit v1.2.3 From 2a9e25796b289f71c0802eca46005c750c57af95 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Oct 2017 11:53:33 +0200 Subject: mac80211: only remove AP VLAN frames from TXQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When removing an AP VLAN interface, mac80211 currently purges the entire TXQ for the AP interface. Fix this by using the FQ API introduced in the previous patch to filter frames. Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/iface.c | 25 +++---------------------- net/mac80211/tx.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9675814f64db..68f874e73561 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2009,6 +2009,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct txq_info *txq, int tid); void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); +void ieee80211_txq_remove_vlan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 2619daa29961..13b16f90e1cf 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -793,9 +793,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { - struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; - struct fq *fq = &local->fq; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; @@ -939,9 +937,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: - txq_sdata = container_of(sdata->bss, - struct ieee80211_sub_if_data, u.ap); - mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -998,8 +993,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, skb_queue_purge(&sdata->skb_queue); } - sdata->bss = NULL; - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_walk_safe(&local->pending[i], skb, tmp) { @@ -1012,22 +1005,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (txq_sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + ieee80211_txq_remove_vlan(local, sdata); - /* - * FIXME FIXME - * - * We really shouldn't purge the *entire* txqi since that - * contains frames for the other AP_VLANs (and possibly - * the AP itself) as well, but there's no API in FQ now - * to be able to filter. - */ - - spin_lock_bh(&fq->lock); - ieee80211_txq_purge(local, txqi); - spin_unlock_bh(&fq->lock); - } + sdata->bss = NULL; if (local->open_count == 0) ieee80211_clear_tx_pending(local); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 94826680cf2b..7b8154474b9e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1396,6 +1396,40 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, fq_flow_get_default_func); } +static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, + struct fq_flow *flow, struct sk_buff *skb, + void *data) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + return info->control.vif == data; +} + +void ieee80211_txq_remove_vlan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct fq *fq = &local->fq; + struct txq_info *txqi; + struct fq_tin *tin; + struct ieee80211_sub_if_data *ap; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) + return; + + ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); + + if (!ap->vif.txq) + return; + + txqi = to_txq_info(ap->vif.txq); + tin = &txqi->tin; + + spin_lock_bh(&fq->lock); + fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif, + fq_skb_free_func); + spin_unlock_bh(&fq->lock); +} + void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txqi, int tid) -- cgit v1.2.3 From 007f6c5e6eb45c81ee89368a5f226572ae638831 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 15 Oct 2015 11:22:58 +0200 Subject: cfg80211: support loading regulatory database as firmware file As the current regulatory database is only about 4k big, and already difficult to extend, we decided that overall it would be better to get rid of the complications with CRDA and load the database into the kernel directly, but in a new format that is extensible. The new file format can be extended since it carries a length field on all the structs that need to be extensible. In order to be able to request firmware when the module initializes, move cfg80211 from subsys_initcall() to the later fs_initcall(); the firmware loader is at the same level but linked earlier, so it can be called from there. Otherwise, when both the firmware loader and cfg80211 are built-in, the request will crash the kernel. We also need to be before device_initcall() so that cfg80211 is available for devices when they initialize. Signed-off-by: Johannes Berg --- net/wireless/Kconfig | 4 +- net/wireless/core.c | 2 +- net/wireless/reg.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 276 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 6c606120abfe..24eec5516649 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -19,6 +19,7 @@ config WEXT_PRIV config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL + select FW_LOADER ---help--- cfg80211 is the Linux wireless LAN (802.11) configuration API. Enable this if you have a wireless device. @@ -167,7 +168,8 @@ config CFG80211_CRDA_SUPPORT depends on CFG80211 help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above.) + need for it, for example when using internal regdb (above) or the + database loaded as a firmware file. If unsure, say Y. diff --git a/net/wireless/core.c b/net/wireless/core.c index 7b33e8c366bc..fdde0d98fde1 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1384,7 +1384,7 @@ out_fail_sysfs: out_fail_pernet: return err; } -subsys_initcall(cfg80211_init); +fs_initcall(cfg80211_init); static void __exit cfg80211_exit(void) { diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 6e94f6934a0e..e9aeb05aaf3e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include "core.h" #include "reg.h" @@ -100,7 +101,7 @@ static struct regulatory_request core_request_world = { static struct regulatory_request __rcu *last_request = (void __force __rcu *)&core_request_world; -/* To trigger userspace events */ +/* To trigger userspace events and load firmware */ static struct platform_device *reg_pdev; /* @@ -443,7 +444,6 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd) return regd; } -#ifdef CONFIG_CFG80211_INTERNAL_REGDB struct reg_regdb_apply_request { struct list_head list; const struct ieee80211_regdomain *regdom; @@ -475,41 +475,44 @@ static void reg_regdb_apply(struct work_struct *work) static DECLARE_WORK(reg_regdb_work, reg_regdb_apply); -static int reg_query_builtin(const char *alpha2) +static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) { - const struct ieee80211_regdomain *regdom = NULL; struct reg_regdb_apply_request *request; - unsigned int i; - - for (i = 0; i < reg_regdb_size; i++) { - if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) { - regdom = reg_regdb[i]; - break; - } - } - - if (!regdom) - return -ENODATA; request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); - if (!request) - return -ENOMEM; - - request->regdom = reg_copy_regd(regdom); - if (IS_ERR_OR_NULL(request->regdom)) { - kfree(request); + if (!request) { + kfree(regdom); return -ENOMEM; } + request->regdom = regdom; + mutex_lock(®_regdb_apply_mutex); list_add_tail(&request->list, ®_regdb_apply_list); mutex_unlock(®_regdb_apply_mutex); schedule_work(®_regdb_work); - return 0; } +#ifdef CONFIG_CFG80211_INTERNAL_REGDB +static int reg_query_builtin(const char *alpha2) +{ + const struct ieee80211_regdomain *regdom = NULL; + unsigned int i; + + for (i = 0; i < reg_regdb_size; i++) { + if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) { + regdom = reg_copy_regd(reg_regdb[i]); + break; + } + } + if (!regdom) + return -ENODATA; + + return reg_schedule_apply(regdom); +} + /* Feel free to add any other sanity checks here */ static void reg_regdb_size_check(void) { @@ -599,12 +602,256 @@ static inline int call_crda(const char *alpha2) } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ +/* code to directly load a firmware database through request_firmware */ +static const struct fwdb_header *regdb; + +struct fwdb_country { + u8 alpha2[2]; + __be16 coll_ptr; + /* this struct cannot be extended */ +} __packed __aligned(4); + +struct fwdb_collection { + u8 len; + u8 n_rules; + u8 dfs_region; + /* no optional data yet */ + /* aligned to 2, then followed by __be16 array of rule pointers */ +} __packed __aligned(4); + +enum fwdb_flags { + FWDB_FLAG_NO_OFDM = BIT(0), + FWDB_FLAG_NO_OUTDOOR = BIT(1), + FWDB_FLAG_DFS = BIT(2), + FWDB_FLAG_NO_IR = BIT(3), + FWDB_FLAG_AUTO_BW = BIT(4), +}; + +struct fwdb_rule { + u8 len; + u8 flags; + __be16 max_eirp; + __be32 start, end, max_bw; + /* start of optional data */ + __be16 cac_timeout; +} __packed __aligned(4); + +#define FWDB_MAGIC 0x52474442 +#define FWDB_VERSION 20 + +struct fwdb_header { + __be32 magic; + __be32 version; + struct fwdb_country country[]; +} __packed __aligned(4); + +static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) +{ + struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); + + if ((u8 *)rule + sizeof(rule->len) > data + size) + return false; + + /* mandatory fields */ + if (rule->len < offsetofend(struct fwdb_rule, max_bw)) + return false; + + return true; +} + +static bool valid_country(const u8 *data, unsigned int size, + const struct fwdb_country *country) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)(data + ptr); + __be16 *rules_ptr; + unsigned int i; + + /* make sure we can read len/n_rules */ + if ((u8 *)coll + offsetofend(typeof(*coll), n_rules) > data + size) + return false; + + /* make sure base struct and all rules fit */ + if ((u8 *)coll + ALIGN(coll->len, 2) + + (coll->n_rules * 2) > data + size) + return false; + + /* mandatory fields must exist */ + if (coll->len < offsetofend(struct fwdb_collection, dfs_region)) + return false; + + rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + + for (i = 0; i < coll->n_rules; i++) { + u16 rule_ptr = be16_to_cpu(rules_ptr[i]); + + if (!valid_rule(data, size, rule_ptr)) + return false; + } + + return true; +} + +static bool valid_regdb(const u8 *data, unsigned int size) +{ + const struct fwdb_header *hdr = (void *)data; + const struct fwdb_country *country; + + if (size < sizeof(*hdr)) + return false; + + if (hdr->magic != cpu_to_be32(FWDB_MAGIC)) + return false; + + if (hdr->version != cpu_to_be32(FWDB_VERSION)) + return false; + + country = &hdr->country[0]; + while ((u8 *)(country + 1) <= data + size) { + if (!country->coll_ptr) + break; + if (!valid_country(data, size, country)) + return false; + country++; + } + + return true; +} + +static int regdb_query_country(const struct fwdb_header *db, + const struct fwdb_country *country) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)((u8 *)db + ptr); + struct ieee80211_regdomain *regdom; + unsigned int size_of_regd; + unsigned int i; + + size_of_regd = + sizeof(struct ieee80211_regdomain) + + coll->n_rules * sizeof(struct ieee80211_reg_rule); + + regdom = kzalloc(size_of_regd, GFP_KERNEL); + if (!regdom) + return -ENOMEM; + + regdom->n_reg_rules = coll->n_rules; + regdom->alpha2[0] = country->alpha2[0]; + regdom->alpha2[1] = country->alpha2[1]; + regdom->dfs_region = coll->dfs_region; + + for (i = 0; i < regdom->n_reg_rules; i++) { + __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; + struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); + struct ieee80211_reg_rule *rrule = ®dom->reg_rules[i]; + + rrule->freq_range.start_freq_khz = be32_to_cpu(rule->start); + rrule->freq_range.end_freq_khz = be32_to_cpu(rule->end); + rrule->freq_range.max_bandwidth_khz = be32_to_cpu(rule->max_bw); + + rrule->power_rule.max_antenna_gain = 0; + rrule->power_rule.max_eirp = be16_to_cpu(rule->max_eirp); + + rrule->flags = 0; + if (rule->flags & FWDB_FLAG_NO_OFDM) + rrule->flags |= NL80211_RRF_NO_OFDM; + if (rule->flags & FWDB_FLAG_NO_OUTDOOR) + rrule->flags |= NL80211_RRF_NO_OUTDOOR; + if (rule->flags & FWDB_FLAG_DFS) + rrule->flags |= NL80211_RRF_DFS; + if (rule->flags & FWDB_FLAG_NO_IR) + rrule->flags |= NL80211_RRF_NO_IR; + if (rule->flags & FWDB_FLAG_AUTO_BW) + rrule->flags |= NL80211_RRF_AUTO_BW; + + rrule->dfs_cac_ms = 0; + + /* handle optional data */ + if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) + rrule->dfs_cac_ms = + 1000 * be16_to_cpu(rule->cac_timeout); + } + + return reg_schedule_apply(regdom); +} + +static int query_regdb(const char *alpha2) +{ + const struct fwdb_header *hdr = regdb; + const struct fwdb_country *country; + + if (IS_ERR(regdb)) + return PTR_ERR(regdb); + + country = &hdr->country[0]; + while (country->coll_ptr) { + if (alpha2_equal(alpha2, country->alpha2)) + return regdb_query_country(regdb, country); + country++; + } + + return -ENODATA; +} + +static void regdb_fw_cb(const struct firmware *fw, void *context) +{ + void *db; + + if (!fw) { + pr_info("failed to load regulatory.db\n"); + regdb = ERR_PTR(-ENODATA); + goto restore; + } + + if (!valid_regdb(fw->data, fw->size)) { + pr_info("loaded regulatory.db is malformed\n"); + release_firmware(fw); + regdb = ERR_PTR(-EINVAL); + goto restore; + } + + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + release_firmware(fw); + + if (!db) + goto restore; + regdb = db; + + if (query_regdb(context)) + goto restore; + goto free; + restore: + rtnl_lock(); + restore_regulatory_settings(true); + rtnl_unlock(); + free: + kfree(context); +} + +static int query_regdb_file(const char *alpha2) +{ + if (regdb) + return query_regdb(alpha2); + + alpha2 = kmemdup(alpha2, 2, GFP_KERNEL); + if (!alpha2) + return -ENOMEM; + + return request_firmware_nowait(THIS_MODULE, true, "regulatory.db", + ®_pdev->dev, GFP_KERNEL, + (void *)alpha2, regdb_fw_cb); +} + static bool reg_query_database(struct regulatory_request *request) { /* query internal regulatory database (if it exists) */ if (reg_query_builtin(request->alpha2) == 0) return true; + if (query_regdb_file(request->alpha2) == 0) + return true; + if (call_crda(request->alpha2) == 0) return true; @@ -3360,4 +3607,7 @@ void regulatory_exit(void) list_del(®_request->list); kfree(reg_request); } + + if (!IS_ERR_OR_NULL(regdb)) + kfree(regdb); } -- cgit v1.2.3 From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2017 16:07:22 +0200 Subject: cfg80211: support reloading regulatory database If the regulatory database is loaded, and then updated, it may be necessary to reload it. Add an nl80211 command to do this. Note that this just reloads the database, it doesn't re-apply the rules from it immediately. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 11 +++++++ net/wireless/reg.c | 80 +++++++++++++++++++++++++++++++++++++------------- net/wireless/reg.h | 6 ++++ 3 files changed, 77 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5129342151e6..67a03f2885a4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5678,6 +5678,11 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) } } +static int nl80211_reload_regdb(struct sk_buff *skb, struct genl_info *info) +{ + return reg_reload_regdb(); +} + static int nl80211_get_mesh_config(struct sk_buff *skb, struct genl_info *info) { @@ -12708,6 +12713,12 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_RELOAD_REGDB, + .doit = nl80211_reload_regdb, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, { .cmd = NL80211_CMD_GET_MESH_CONFIG, .doit = nl80211_get_mesh_config, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index e9aeb05aaf3e..180addda52af 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -781,6 +781,8 @@ static int query_regdb(const char *alpha2) const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; + ASSERT_RTNL(); + if (IS_ERR(regdb)) return PTR_ERR(regdb); @@ -796,41 +798,47 @@ static int query_regdb(const char *alpha2) static void regdb_fw_cb(const struct firmware *fw, void *context) { + int set_error = 0; + bool restore = true; void *db; if (!fw) { pr_info("failed to load regulatory.db\n"); - regdb = ERR_PTR(-ENODATA); - goto restore; - } - - if (!valid_regdb(fw->data, fw->size)) { + set_error = -ENODATA; + } else if (!valid_regdb(fw->data, fw->size)) { pr_info("loaded regulatory.db is malformed\n"); - release_firmware(fw); - regdb = ERR_PTR(-EINVAL); - goto restore; + set_error = -EINVAL; } - db = kmemdup(fw->data, fw->size, GFP_KERNEL); - release_firmware(fw); + rtnl_lock(); + if (WARN_ON(regdb && !IS_ERR(regdb))) { + /* just restore and free new db */ + } else if (set_error) { + regdb = ERR_PTR(set_error); + } else if (fw) { + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (db) { + regdb = db; + restore = context && query_regdb(context); + } else { + restore = true; + } + } - if (!db) - goto restore; - regdb = db; + if (restore) + restore_regulatory_settings(true); - if (query_regdb(context)) - goto restore; - goto free; - restore: - rtnl_lock(); - restore_regulatory_settings(true); rtnl_unlock(); - free: + kfree(context); + + release_firmware(fw); } static int query_regdb_file(const char *alpha2) { + ASSERT_RTNL(); + if (regdb) return query_regdb(alpha2); @@ -843,6 +851,38 @@ static int query_regdb_file(const char *alpha2) (void *)alpha2, regdb_fw_cb); } +int reg_reload_regdb(void) +{ + const struct firmware *fw; + void *db; + int err; + + err = request_firmware(&fw, "regulatory.db", ®_pdev->dev); + if (err) + return err; + + if (!valid_regdb(fw->data, fw->size)) { + err = -ENODATA; + goto out; + } + + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (!db) { + err = -ENOMEM; + goto out; + } + + rtnl_lock(); + if (!IS_ERR_OR_NULL(regdb)) + kfree(regdb); + regdb = db; + rtnl_unlock(); + + out: + release_firmware(fw); + return err; +} + static bool reg_query_database(struct regulatory_request *request) { /* query internal regulatory database (if it exists) */ diff --git a/net/wireless/reg.h b/net/wireless/reg.h index ca7fedf2e7a1..9529c522611a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -179,4 +179,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, * @wiphy2 - wiphy it's dfs_region to be checked against that of wiphy1 */ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); + +/** + * reg_reload_regdb - reload the regulatory.db firmware file + */ +int reg_reload_regdb(void); + #endif /* __NET_WIRELESS_REG_H */ -- cgit v1.2.3 From c8c240e284b3d821011b4f680b3eaa99569b3756 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 15 Oct 2015 14:35:41 +0200 Subject: cfg80211: reg: remove support for built-in regdb Parsing and building C structures from a regdb is no longer needed since the "firmware" file (regulatory.db) can be linked into the kernel image to achieve the same effect. Signed-off-by: Johannes Berg --- net/wireless/.gitignore | 1 - net/wireless/Kconfig | 24 +------ net/wireless/Makefile | 6 -- net/wireless/db.txt | 17 ----- net/wireless/genregdb.awk | 158 ---------------------------------------------- net/wireless/reg.c | 39 ------------ net/wireless/regdb.h | 23 ------- 7 files changed, 1 insertion(+), 267 deletions(-) delete mode 100644 net/wireless/.gitignore delete mode 100644 net/wireless/db.txt delete mode 100644 net/wireless/genregdb.awk delete mode 100644 net/wireless/regdb.h (limited to 'net') diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore deleted file mode 100644 index c33451b896d9..000000000000 --- a/net/wireless/.gitignore +++ /dev/null @@ -1 +0,0 @@ -regdb.c diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 24eec5516649..f050030055c5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -140,30 +140,8 @@ config CFG80211_DEBUGFS If unsure, say N. -config CFG80211_INTERNAL_REGDB - bool "use statically compiled regulatory rules database" if EXPERT - default n - depends on CFG80211 - ---help--- - This option generates an internal data structure representing - the wireless regulatory rules described in net/wireless/db.txt - and includes code to query that database. This is an alternative - to using CRDA for defining regulatory rules for the kernel. - - Using this option requires some parsing of the db.txt at build time, - the parser will be upkept with the latest wireless-regdb updates but - older wireless-regdb formats will be ignored. The parser may later - be replaced to avoid issues with conflicts on versions of - wireless-regdb. - - For details see: - - http://wireless.kernel.org/en/developers/Regulatory - - Most distributions have a CRDA package. So if unsure, say N. - config CFG80211_CRDA_SUPPORT - bool "support CRDA" if CFG80211_INTERNAL_REGDB + bool "support CRDA" if EXPERT default y depends on CFG80211 help diff --git a/net/wireless/Makefile b/net/wireless/Makefile index d06e5015751a..5f20dac5d8c6 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -14,11 +14,5 @@ cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o cfg80211-$(CONFIG_OF) += of.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o -cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o CFLAGS_trace.o := -I$(src) - -$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk - @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@ - -clean-files := regdb.c diff --git a/net/wireless/db.txt b/net/wireless/db.txt deleted file mode 100644 index a2fc3a09ccdc..000000000000 --- a/net/wireless/db.txt +++ /dev/null @@ -1,17 +0,0 @@ -# -# This file is a placeholder to prevent accidental build breakage if someone -# enables CONFIG_CFG80211_INTERNAL_REGDB. Almost no one actually needs to -# enable that build option. -# -# You should be using CRDA instead. It is even better if you use the CRDA -# package provided by your distribution, since they will probably keep it -# up-to-date on your behalf. -# -# If you _really_ intend to use CONFIG_CFG80211_INTERNAL_REGDB then you will -# need to replace this file with one containing appropriately formatted -# regulatory rules that cover the regulatory domains you will be using. Your -# best option is to extract the db.txt file from the wireless-regdb git -# repository: -# -# git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git -# diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk deleted file mode 100644 index baf2426b555a..000000000000 --- a/net/wireless/genregdb.awk +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/awk -f -# -# genregdb.awk -- generate regdb.c from db.txt -# -# Actually, it reads from stdin (presumed to be db.txt) and writes -# to stdout (presumed to be regdb.c), but close enough... -# -# Copyright 2009 John W. Linville -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -BEGIN { - active = 0 - rules = 0; - print "/*" - print " * DO NOT EDIT -- file generated from data in db.txt" - print " */" - print "" - print "#include " - print "#include " - print "#include \"regdb.h\"" - print "" - regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n" -} - -function parse_country_head() { - country=$2 - sub(/:/, "", country) - printf "static const struct ieee80211_regdomain regdom_%s = {\n", country - printf "\t.alpha2 = \"%s\",\n", country - if ($NF ~ /DFS-ETSI/) - printf "\t.dfs_region = NL80211_DFS_ETSI,\n" - else if ($NF ~ /DFS-FCC/) - printf "\t.dfs_region = NL80211_DFS_FCC,\n" - else if ($NF ~ /DFS-JP/) - printf "\t.dfs_region = NL80211_DFS_JP,\n" - printf "\t.reg_rules = {\n" - active = 1 - regdb = regdb "\t®dom_" country ",\n" -} - -function parse_reg_rule() -{ - flag_starts_at = 7 - - start = $1 - sub(/\(/, "", start) - end = $3 - bw = $5 - sub(/\),/, "", bw) - gain = 0 - power = $6 - # power might be in mW... - units = $7 - dfs_cac = 0 - - sub(/\(/, "", power) - sub(/\),/, "", power) - sub(/\),/, "", units) - sub(/\)/, "", units) - - if (units == "mW") { - flag_starts_at = 8 - power = 10 * log(power)/log(10) - if ($8 ~ /[[:digit:]]/) { - flag_starts_at = 9 - dfs_cac = $8 - } - } else { - if ($7 ~ /[[:digit:]]/) { - flag_starts_at = 8 - dfs_cac = $7 - } - } - sub(/\(/, "", dfs_cac) - sub(/\),/, "", dfs_cac) - flagstr = "" - for (i=flag_starts_at; i<=NF; i++) - flagstr = flagstr $i - split(flagstr, flagarray, ",") - flags = "" - for (arg in flagarray) { - if (flagarray[arg] == "NO-OFDM") { - flags = flags "\n\t\t\tNL80211_RRF_NO_OFDM | " - } else if (flagarray[arg] == "NO-CCK") { - flags = flags "\n\t\t\tNL80211_RRF_NO_CCK | " - } else if (flagarray[arg] == "NO-INDOOR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_INDOOR | " - } else if (flagarray[arg] == "NO-OUTDOOR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_OUTDOOR | " - } else if (flagarray[arg] == "DFS") { - flags = flags "\n\t\t\tNL80211_RRF_DFS | " - } else if (flagarray[arg] == "PTP-ONLY") { - flags = flags "\n\t\t\tNL80211_RRF_PTP_ONLY | " - } else if (flagarray[arg] == "PTMP-ONLY") { - flags = flags "\n\t\t\tNL80211_RRF_PTMP_ONLY | " - } else if (flagarray[arg] == "PASSIVE-SCAN") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "NO-IBSS") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "NO-IR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "AUTO-BW") { - flags = flags "\n\t\t\tNL80211_RRF_AUTO_BW | " - } - - } - flags = flags "0" - printf "\t\tREG_RULE_EXT(%d, %d, %d, %d, %.0f, %d, %s),\n", start, end, bw, gain, power, dfs_cac, flags - rules++ -} - -function print_tail_country() -{ - active = 0 - printf "\t},\n" - printf "\t.n_reg_rules = %d\n", rules - printf "};\n\n" - rules = 0; -} - -/^[ \t]*#/ { - # Ignore -} - -!active && /^[ \t]*$/ { - # Ignore -} - -!active && /country/ { - parse_country_head() -} - -active && /^[ \t]*\(/ { - parse_reg_rule() -} - -active && /^[ \t]*$/ { - print_tail_country() -} - -END { - if (active) - print_tail_country() - print regdb "};" - print "" - print "int reg_regdb_size = ARRAY_SIZE(reg_regdb);" -} diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 180addda52af..ebf8267ffbc9 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -59,7 +59,6 @@ #include "core.h" #include "reg.h" #include "rdev-ops.h" -#include "regdb.h" #include "nl80211.h" /* @@ -495,38 +494,6 @@ static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) return 0; } -#ifdef CONFIG_CFG80211_INTERNAL_REGDB -static int reg_query_builtin(const char *alpha2) -{ - const struct ieee80211_regdomain *regdom = NULL; - unsigned int i; - - for (i = 0; i < reg_regdb_size; i++) { - if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) { - regdom = reg_copy_regd(reg_regdb[i]); - break; - } - } - if (!regdom) - return -ENODATA; - - return reg_schedule_apply(regdom); -} - -/* Feel free to add any other sanity checks here */ -static void reg_regdb_size_check(void) -{ - /* We should ideally BUILD_BUG_ON() but then random builds would fail */ - WARN_ONCE(!reg_regdb_size, "db.txt is empty, you should update it..."); -} -#else -static inline void reg_regdb_size_check(void) {} -static inline int reg_query_builtin(const char *alpha2) -{ - return -ENODATA; -} -#endif /* CONFIG_CFG80211_INTERNAL_REGDB */ - #ifdef CONFIG_CFG80211_CRDA_SUPPORT /* Max number of consecutive attempts to communicate with CRDA */ #define REG_MAX_CRDA_TIMEOUTS 10 @@ -885,10 +852,6 @@ int reg_reload_regdb(void) static bool reg_query_database(struct regulatory_request *request) { - /* query internal regulatory database (if it exists) */ - if (reg_query_builtin(request->alpha2) == 0) - return true; - if (query_regdb_file(request->alpha2) == 0) return true; @@ -3580,8 +3543,6 @@ int __init regulatory_init(void) spin_lock_init(®_pending_beacons_lock); spin_lock_init(®_indoor_lock); - reg_regdb_size_check(); - rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; diff --git a/net/wireless/regdb.h b/net/wireless/regdb.h deleted file mode 100644 index 3279cfcefb0c..000000000000 --- a/net/wireless/regdb.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __REGDB_H__ -#define __REGDB_H__ - -/* - * Copyright 2009 John W. Linville - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -extern const struct ieee80211_regdomain *reg_regdb[]; -extern int reg_regdb_size; - -#endif /* __REGDB_H__ */ -- cgit v1.2.3 From 90a53e4432b12288316efaa5f308adafb8d304b0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2017 22:21:08 +0200 Subject: cfg80211: implement regdb signature checking Currently CRDA implements the signature checking, and the previous commits added the ability to load the whole regulatory database into the kernel. However, we really can't lose the signature checking, so implement it in the kernel by loading a detached signature (regulatory.db.p7s) and check it against built-in keys. Signed-off-by: Johannes Berg --- net/wireless/.gitignore | 2 + net/wireless/Kconfig | 30 ++++++++++ net/wireless/Makefile | 22 +++++++ net/wireless/certs/sforshee.x509 | Bin 0 -> 680 bytes net/wireless/reg.c | 121 ++++++++++++++++++++++++++++++++++++++- net/wireless/reg.h | 8 +++ 6 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 net/wireless/.gitignore create mode 100644 net/wireless/certs/sforshee.x509 (limited to 'net') diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore new file mode 100644 index 000000000000..61cbc304a3d3 --- /dev/null +++ b/net/wireless/.gitignore @@ -0,0 +1,2 @@ +shipped-certs.c +extra-certs.c diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index f050030055c5..da91bb547db3 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -83,6 +83,36 @@ config CFG80211_CERTIFICATION_ONUS you are a wireless researcher and are working in a controlled and approved environment by your local regulatory agency. +config CFG80211_REQUIRE_SIGNED_REGDB + bool "require regdb signature" if CFG80211_CERTIFICATION_ONUS + default y + select SYSTEM_DATA_VERIFICATION + help + Require that in addition to the "regulatory.db" file a + "regulatory.db.p7s" can be loaded with a valid PKCS#7 + signature for the regulatory.db file made by one of the + keys in the certs/ directory. + +config CFG80211_USE_KERNEL_REGDB_KEYS + bool "allow regdb keys shipped with the kernel" if CFG80211_CERTIFICATION_ONUS + default y + depends on CFG80211_REQUIRE_SIGNED_REGDB + help + Allow the regulatory database to be signed by one of the keys for + which certificates are part of the kernel sources + (in net/wireless/certs/). + + This is currently only Seth Forshee's key, who is the regulatory + database maintainer. + +config CFG80211_EXTRA_REGDB_KEYDIR + string "additional regdb key directory" if CFG80211_CERTIFICATION_ONUS + depends on CFG80211_REQUIRE_SIGNED_REGDB + help + If selected, point to a directory with DER-encoded X.509 + certificates like in the kernel sources (net/wireless/certs/) + that shall be accepted for a signed regulatory database. + config CFG80211_REG_CELLULAR_HINTS bool "cfg80211 regulatory support for cellular base station hints" depends on CFG80211_CERTIFICATION_ONUS diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 5f20dac5d8c6..219baea57e4e 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -16,3 +16,25 @@ cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o CFLAGS_trace.o := -I$(src) + +cfg80211-$(CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS) += shipped-certs.o +ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) +cfg80211-y += extra-certs.o +endif + +$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) + @echo " GEN $@" + @echo '#include "reg.h"' > $@ + @echo 'const u8 shipped_regdb_certs[] = {' >> $@ + @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done + @echo '};' >> $@ + @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ + $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) + @echo " GEN $@" + @echo '#include "reg.h"' > $@ + @echo 'const u8 extra_regdb_certs[] = {' >> $@ + @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done + @echo '};' >> $@ + @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509 new file mode 100644 index 000000000000..c6f8f9d6b988 Binary files /dev/null and b/net/wireless/certs/sforshee.x509 differ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index ebf8267ffbc9..58319c82ecb3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -659,6 +660,115 @@ static bool valid_country(const u8 *data, unsigned int size, return true; } +#ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB +static struct key *builtin_regdb_keys; + +static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) +{ + const u8 *end = p + buflen; + size_t plen; + key_ref_t key; + + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), + "asymmetric", NULL, p, plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); +} + +static int __init load_builtin_regdb_keys(void) +{ + builtin_regdb_keys = + keyring_alloc(".builtin_regdb_keys", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(builtin_regdb_keys)) + return PTR_ERR(builtin_regdb_keys); + + pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); + +#ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS + load_keys_from_buffer(shipped_regdb_certs, shipped_regdb_certs_len); +#endif +#ifdef CFG80211_EXTRA_REGDB_KEYDIR + if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') + load_keys_from_buffer(extra_regdb_certs, extra_regdb_certs_len); +#endif + + return 0; +} + +static bool regdb_has_valid_signature(const u8 *data, unsigned int size) +{ + const struct firmware *sig; + bool result; + + if (request_firmware(&sig, "regulatory.db.p7s", ®_pdev->dev)) + return false; + + result = verify_pkcs7_signature(data, size, sig->data, sig->size, + builtin_regdb_keys, + VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL) == 0; + + release_firmware(sig); + + return result; +} + +static void free_regdb_keyring(void) +{ + key_put(builtin_regdb_keys); +} +#else +static int load_builtin_regdb_keys(void) +{ + return 0; +} + +static bool regdb_has_valid_signature(const u8 *data, unsigned int size) +{ + return true; +} + +static void free_regdb_keyring(void) +{ +} +#endif /* CONFIG_CFG80211_REQUIRE_SIGNED_REGDB */ + static bool valid_regdb(const u8 *data, unsigned int size) { const struct fwdb_header *hdr = (void *)data; @@ -673,6 +783,9 @@ static bool valid_regdb(const u8 *data, unsigned int size) if (hdr->version != cpu_to_be32(FWDB_VERSION)) return false; + if (!regdb_has_valid_signature(data, size)) + return false; + country = &hdr->country[0]; while ((u8 *)(country + 1) <= data + size) { if (!country->coll_ptr) @@ -773,7 +886,7 @@ static void regdb_fw_cb(const struct firmware *fw, void *context) pr_info("failed to load regulatory.db\n"); set_error = -ENODATA; } else if (!valid_regdb(fw->data, fw->size)) { - pr_info("loaded regulatory.db is malformed\n"); + pr_info("loaded regulatory.db is malformed or signature is missing/invalid\n"); set_error = -EINVAL; } @@ -3535,6 +3648,10 @@ int __init regulatory_init(void) { int err = 0; + err = load_builtin_regdb_keys(); + if (err) + return err; + reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); @@ -3611,4 +3728,6 @@ void regulatory_exit(void) if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); + + free_regdb_keyring(); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 9529c522611a..9ceeb5f3a7cb 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -1,5 +1,8 @@ #ifndef __NET_WIRELESS_REG_H #define __NET_WIRELESS_REG_H + +#include + /* * Copyright 2008-2011 Luis R. Rodriguez * @@ -185,4 +188,9 @@ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); */ int reg_reload_regdb(void); +extern const u8 shipped_regdb_certs[]; +extern unsigned int shipped_regdb_certs_len; +extern const u8 extra_regdb_certs[]; +extern unsigned int extra_regdb_certs_len; + #endif /* __NET_WIRELESS_REG_H */ -- cgit v1.2.3 From 9e97964d5e500d8d0df94e1a79ad715ad4f9c995 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 11 Oct 2017 15:46:45 +0200 Subject: mac80211: use crypto_aead_authsize() Evidently this API is intended to be used to isolate against API changes, so use it instead of accessing ->authsize. Signed-off-by: Johannes Berg --- net/mac80211/aead_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/aead_api.c b/net/mac80211/aead_api.c index 347f13953b2c..160f9df30402 100644 --- a/net/mac80211/aead_api.c +++ b/net/mac80211/aead_api.c @@ -21,7 +21,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, u8 *data, size_t data_len, u8 *mic) { - size_t mic_len = tfm->authsize; + size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); @@ -52,7 +52,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, u8 *data, size_t data_len, u8 *mic) { - size_t mic_len = tfm->authsize; + size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); -- cgit v1.2.3 From ae85bfa87821b9fa60bf846c09a6a0056d87cdb2 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:17 -0700 Subject: net: qrtr: Invoke sk_error_report() after setting sk_err Rather than manually waking up any context sleeping on the sock to signal an error we should call sk_error_report(). This has the added benefit that in-kernel consumers can override this notification with its own callback. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c2f5c13550c0..7e4b49a8349e 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -541,7 +541,7 @@ static void qrtr_reset_ports(void) sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; - wake_up_interruptible(sk_sleep(&ipc->sk)); + ipc->sk.sk_error_report(&ipc->sk); sock_put(&ipc->sk); } mutex_unlock(&qrtr_port_lock); -- cgit v1.2.3 From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:18 -0700 Subject: net: qrtr: Move constants to header file The constants are used by both the name server and clients, so clarify their value and move them to the uapi header. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 7e4b49a8349e..15981abc042c 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -61,8 +61,6 @@ struct qrtr_hdr { } __packed; #define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) -#define QRTR_NODE_BCAST ((unsigned int)-1) -#define QRTR_PORT_CTRL ((unsigned int)-2) struct qrtr_sock { /* WARNING: sk must be the first member */ -- cgit v1.2.3 From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:19 -0700 Subject: net: qrtr: Add control packet definition to uapi The QMUX protocol specification defines structure of the special control packet messages being sent between handlers of the control port. Add these to the uapi header, as this structure and the associated types are shared between the kernel and all userspace handlers of control messages. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 15981abc042c..d85ca7170b8f 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -26,18 +26,6 @@ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff -enum qrtr_pkt_type { - QRTR_TYPE_DATA = 1, - QRTR_TYPE_HELLO = 2, - QRTR_TYPE_BYE = 3, - QRTR_TYPE_NEW_SERVER = 4, - QRTR_TYPE_DEL_SERVER = 5, - QRTR_TYPE_DEL_CLIENT = 6, - QRTR_TYPE_RESUME_TX = 7, - QRTR_TYPE_EXIT = 8, - QRTR_TYPE_PING = 9, -}; - /** * struct qrtr_hdr - (I|R)PCrouter packet header * @version: protocol version -- cgit v1.2.3 From e7044482c8ac5081f4775063995647787d5082a4 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:20 -0700 Subject: net: qrtr: Pass source and destination to enqueue functions Defer writing the message header to the skb until its time to enqueue the packet. As the receive path is reworked to decode the message header as it's received from the transport and only pass around the payload in the skb this change means that we do not have to fill out the full message header just to decode it immediately in qrtr_local_enqueue(). In the future this change also makes it possible to prepend message headers based on the version of each link. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 120 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index d85ca7170b8f..82dc83789310 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -97,8 +97,12 @@ struct qrtr_node { struct list_head item; }; -static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb); -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb); +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); /* Release node resources and free the node. * @@ -136,10 +140,27 @@ static void qrtr_node_release(struct qrtr_node *node) } /* Pass an outgoing packet socket buffer to the endpoint driver. */ -static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { + struct qrtr_hdr *hdr; + size_t len = skb->len; int rc = -ENODEV; + hdr = skb_push(skb, QRTR_HDR_SIZE); + hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr->type = cpu_to_le32(type); + hdr->src_node_id = cpu_to_le32(from->sq_node); + hdr->src_port_id = cpu_to_le32(from->sq_port); + hdr->dst_node_id = cpu_to_le32(to->sq_node); + hdr->dst_port_id = cpu_to_le32(to->sq_port); + + hdr->size = cpu_to_le32(len); + hdr->confirm_rx = 0; + + skb_put_padto(skb, ALIGN(len, 4)); + mutex_lock(&node->ep_lock); if (node->ep) rc = node->ep->xmit(node->ep, skb); @@ -237,23 +258,13 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_post); static struct sk_buff *qrtr_alloc_ctrl_packet(u32 type, size_t pkt_len, u32 src_node, u32 dst_node) { - struct qrtr_hdr *hdr; struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); if (!skb) return NULL; - skb_reset_transport_header(skb); - hdr = skb_put(skb, QRTR_HDR_SIZE); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->type = cpu_to_le32(type); - hdr->src_node_id = cpu_to_le32(src_node); - hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(pkt_len); - hdr->dst_node_id = cpu_to_le32(dst_node); - hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); + skb_reserve(skb, QRTR_HDR_SIZE); return skb; } @@ -326,6 +337,8 @@ static void qrtr_port_put(struct qrtr_sock *ipc); static void qrtr_node_rx_work(struct work_struct *work) { struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct sockaddr_qrtr dst; + struct sockaddr_qrtr src; struct sk_buff *skb; while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { @@ -341,6 +354,11 @@ static void qrtr_node_rx_work(struct work_struct *work) dst_port = le32_to_cpu(phdr->dst_port_id); confirm = !!phdr->confirm_rx; + src.sq_node = src_node; + src.sq_port = le32_to_cpu(phdr->src_port_id); + dst.sq_node = dst_node; + dst.sq_port = dst_port; + qrtr_node_assign(node, src_node); ipc = qrtr_port_lookup(dst_port); @@ -357,7 +375,9 @@ static void qrtr_node_rx_work(struct work_struct *work) skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); if (!skb) break; - if (qrtr_node_enqueue(node, skb)) + + if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, + &dst, &src)) break; } } @@ -407,6 +427,8 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_register); void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; + struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; + struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; struct sk_buff *skb; mutex_lock(&node->ep_lock); @@ -416,7 +438,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) /* Notify the local controller about the event */ skb = qrtr_alloc_local_bye(node->nid); if (skb) - qrtr_local_enqueue(NULL, skb); + qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); qrtr_node_release(node); ep->node = NULL; @@ -454,11 +476,17 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) { struct sk_buff *skb; int port = ipc->us.sq_port; + struct sockaddr_qrtr to; + + to.sq_family = AF_QIPCRTR; + to.sq_node = QRTR_NODE_BCAST; + to.sq_port = QRTR_PORT_CTRL; skb = qrtr_alloc_del_client(&ipc->us); if (skb) { skb_set_owner_w(skb, &ipc->sk); - qrtr_bcast_enqueue(NULL, skb); + qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, + &to); } if (port == QRTR_PORT_CTRL) @@ -606,19 +634,25 @@ static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) } /* Queue packet to local peer socket. */ -static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { - const struct qrtr_hdr *phdr; struct qrtr_sock *ipc; + struct qrtr_hdr *phdr; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - - ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id)); + ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ kfree_skb(skb); return -ENODEV; } + phdr = skb_push(skb, QRTR_HDR_SIZE); + skb_reset_transport_header(skb); + + phdr->src_node_id = cpu_to_le32(from->sq_node); + phdr->src_port_id = cpu_to_le32(from->sq_port); + if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); @@ -631,7 +665,9 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) } /* Queue packet for broadcast. */ -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { struct sk_buff *skbn; @@ -641,11 +677,11 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) if (!skbn) break; skb_set_owner_w(skbn, skb->sk); - qrtr_node_enqueue(node, skbn); + qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); - qrtr_local_enqueue(node, skb); + qrtr_local_enqueue(node, skb, type, from, to); return 0; } @@ -653,13 +689,14 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *); + int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, + struct sockaddr_qrtr *, struct sockaddr_qrtr *); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; - struct qrtr_hdr *hdr; struct sk_buff *skb; size_t plen; + u32 type = QRTR_TYPE_DATA; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) @@ -713,32 +750,14 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (!skb) goto out_node; - skb_reset_transport_header(skb); - skb_put(skb, len + QRTR_HDR_SIZE); - - hdr = (struct qrtr_hdr *)skb_transport_header(skb); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->src_node_id = cpu_to_le32(ipc->us.sq_node); - hdr->src_port_id = cpu_to_le32(ipc->us.sq_port); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(len); - hdr->dst_node_id = cpu_to_le32(addr->sq_node); - hdr->dst_port_id = cpu_to_le32(addr->sq_port); + skb_reserve(skb, QRTR_HDR_SIZE); - rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE, - &msg->msg_iter, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } - if (plen != len) { - rc = skb_pad(skb, plen - len); - if (rc) - goto out_node; - skb_put(skb, plen - len); - } - if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; @@ -747,12 +766,11 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } /* control messages already require the type as 'command' */ - skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4); - } else { - hdr->type = cpu_to_le32(QRTR_TYPE_DATA); + skb_copy_bits(skb, 0, &type, 4); + type = le32_to_cpu(type); } - rc = enqueue_fn(node, skb); + rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; -- cgit v1.2.3 From 1a7959c76641674991ba3a49f25432f1e105391e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:21 -0700 Subject: net: qrtr: Clean up control packet handling As the message header generation is deferred the internal functions for generating control packets can be simplified. This patch modifies qrtr_alloc_ctrl_packet() to, in addition to the sk_buff, return a reference to a struct qrtr_ctrl_pkt, which clarifies and simplifies the helpers to the point that these functions can be folded back into the callers. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 93 ++++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 82dc83789310..a84edba7b1ef 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -255,9 +255,18 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); -static struct sk_buff *qrtr_alloc_ctrl_packet(u32 type, size_t pkt_len, - u32 src_node, u32 dst_node) +/** + * qrtr_alloc_ctrl_packet() - allocate control packet skb + * @pkt: reference to qrtr_ctrl_pkt pointer + * + * Returns newly allocated sk_buff, or NULL on failure + * + * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and + * on success returns a reference to the control packet in @pkt. + */ +static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) { + const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); @@ -265,64 +274,7 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(u32 type, size_t pkt_len, return NULL; skb_reserve(skb, QRTR_HDR_SIZE); - - return skb; -} - -/* Allocate and construct a resume-tx packet. */ -static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, - u32 dst_node, u32 port) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; - - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_RESUME_TX, pkt_len, - src_node, dst_node); - if (!skb) - return NULL; - - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); - buf[1] = cpu_to_le32(src_node); - buf[2] = cpu_to_le32(port); - - return skb; -} - -/* Allocate and construct a BYE message to signal remote termination */ -static struct sk_buff *qrtr_alloc_local_bye(u32 src_node) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; - - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_BYE, pkt_len, - src_node, qrtr_local_nid); - if (!skb) - return NULL; - - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_BYE); - - return skb; -} - -static struct sk_buff *qrtr_alloc_del_client(struct sockaddr_qrtr *sq) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; - - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_DEL_CLIENT, pkt_len, - sq->sq_node, QRTR_NODE_BCAST); - if (!skb) - return NULL; - - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); - buf[1] = cpu_to_le32(sq->sq_node); - buf[2] = cpu_to_le32(sq->sq_port); + *pkt = skb_put_zero(skb, pkt_len); return skb; } @@ -337,6 +289,7 @@ static void qrtr_port_put(struct qrtr_sock *ipc); static void qrtr_node_rx_work(struct work_struct *work) { struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct qrtr_ctrl_pkt *pkt; struct sockaddr_qrtr dst; struct sockaddr_qrtr src; struct sk_buff *skb; @@ -372,10 +325,14 @@ static void qrtr_node_rx_work(struct work_struct *work) } if (confirm) { - skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); + skb = qrtr_alloc_ctrl_packet(&pkt); if (!skb) break; + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(dst.sq_node); + pkt->client.port = cpu_to_le32(dst.sq_port); + if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &dst, &src)) break; @@ -429,6 +386,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; mutex_lock(&node->ep_lock); @@ -436,9 +394,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ - skb = qrtr_alloc_local_bye(node->nid); - if (skb) + skb = qrtr_alloc_ctrl_packet(&pkt); + if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + } qrtr_node_release(node); ep->node = NULL; @@ -474,6 +434,7 @@ static void qrtr_port_put(struct qrtr_sock *ipc) /* Remove port assignment. */ static void qrtr_port_remove(struct qrtr_sock *ipc) { + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; struct sockaddr_qrtr to; @@ -482,8 +443,12 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; - skb = qrtr_alloc_del_client(&ipc->us); + skb = qrtr_alloc_ctrl_packet(&pkt); if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); + pkt->client.node = cpu_to_le32(ipc->us.sq_node); + pkt->client.port = cpu_to_le32(ipc->us.sq_port); + skb_set_owner_w(skb, &ipc->sk); qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, &to); -- cgit v1.2.3 From f507a9b6e63b9f41b61d199c36ae046d17b5fe4b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:22 -0700 Subject: net: qrtr: Use sk_buff->cb in receive path Rather than parsing the header of incoming messages throughout the implementation do it once when we retrieve the message and store the relevant information in the "cb" member of the sk_buff. This allows us to, in a later commit, decode version 2 messages into this same structure. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 70 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index a84edba7b1ef..7bca6ec892a5 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -48,6 +48,16 @@ struct qrtr_hdr { __le32 dst_port_id; } __packed; +struct qrtr_cb { + u32 src_node; + u32 src_port; + u32 dst_node; + u32 dst_port; + + u8 type; + u8 confirm_rx; +}; + #define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) struct qrtr_sock { @@ -216,6 +226,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) struct qrtr_node *node = ep->node; const struct qrtr_hdr *phdr = data; struct sk_buff *skb; + struct qrtr_cb *cb; unsigned int psize; unsigned int size; unsigned int type; @@ -245,8 +256,15 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (!skb) return -ENOMEM; - skb_reset_transport_header(skb); - skb_put_data(skb, data, len); + cb = (struct qrtr_cb *)skb->cb; + cb->src_node = le32_to_cpu(phdr->src_node_id); + cb->src_port = le32_to_cpu(phdr->src_port_id); + cb->dst_node = le32_to_cpu(phdr->dst_node_id); + cb->dst_port = le32_to_cpu(phdr->dst_port_id); + cb->type = type; + cb->confirm_rx = !!phdr->confirm_rx; + + skb_put_data(skb, data + QRTR_HDR_SIZE, size); skb_queue_tail(&node->rx_queue, skb); schedule_work(&node->work); @@ -295,26 +313,20 @@ static void qrtr_node_rx_work(struct work_struct *work) struct sk_buff *skb; while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - const struct qrtr_hdr *phdr; - u32 dst_node, dst_port; struct qrtr_sock *ipc; - u32 src_node; + struct qrtr_cb *cb; int confirm; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - src_node = le32_to_cpu(phdr->src_node_id); - dst_node = le32_to_cpu(phdr->dst_node_id); - dst_port = le32_to_cpu(phdr->dst_port_id); - confirm = !!phdr->confirm_rx; + cb = (struct qrtr_cb *)skb->cb; + src.sq_node = cb->src_node; + src.sq_port = cb->src_port; + dst.sq_node = cb->dst_node; + dst.sq_port = cb->dst_port; + confirm = !!cb->confirm_rx; - src.sq_node = src_node; - src.sq_port = le32_to_cpu(phdr->src_port_id); - dst.sq_node = dst_node; - dst.sq_port = dst_port; + qrtr_node_assign(node, cb->src_node); - qrtr_node_assign(node, src_node); - - ipc = qrtr_port_lookup(dst_port); + ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) { kfree_skb(skb); } else { @@ -604,7 +616,7 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, struct sockaddr_qrtr *to) { struct qrtr_sock *ipc; - struct qrtr_hdr *phdr; + struct qrtr_cb *cb; ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ @@ -612,11 +624,9 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, return -ENODEV; } - phdr = skb_push(skb, QRTR_HDR_SIZE); - skb_reset_transport_header(skb); - - phdr->src_node_id = cpu_to_le32(from->sq_node); - phdr->src_port_id = cpu_to_le32(from->sq_port); + cb = (struct qrtr_cb *)skb->cb; + cb->src_node = from->sq_node; + cb->src_port = from->sq_port; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); @@ -750,9 +760,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - const struct qrtr_hdr *phdr; struct sock *sk = sock->sk; struct sk_buff *skb; + struct qrtr_cb *cb; int copied, rc; lock_sock(sk); @@ -769,22 +779,22 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, return rc; } - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - copied = le32_to_cpu(phdr->size); + copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { + cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; - addr->sq_node = le32_to_cpu(phdr->src_node_id); - addr->sq_port = le32_to_cpu(phdr->src_port_id); + addr->sq_node = cb->src_node; + addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } @@ -877,7 +887,7 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) - len = skb->len - QRTR_HDR_SIZE; + len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: -- cgit v1.2.3 From 194ccc88297ae78d0803adad83c6dcc369787c9e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:23 -0700 Subject: net: qrtr: Support decoding incoming v2 packets Add the necessary logic for decoding incoming messages of version 2 as well. Also make sure there's room for the bigger of version 1 and 2 headers in the code allocating skbs for outgoing messages. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 132 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 7bca6ec892a5..e458ece96d3d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -20,14 +20,15 @@ #include "qrtr.h" -#define QRTR_PROTO_VER 1 +#define QRTR_PROTO_VER_1 1 +#define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff /** - * struct qrtr_hdr - (I|R)PCrouter packet header + * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node @@ -37,7 +38,7 @@ * @dst_node_id: destination node * @dst_port_id: destination port */ -struct qrtr_hdr { +struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; @@ -48,6 +49,32 @@ struct qrtr_hdr { __le32 dst_port_id; } __packed; +/** + * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions + * @version: protocol version + * @type: packet type; one of QRTR_TYPE_* + * @flags: bitmask of QRTR_FLAGS_* + * @optlen: length of optional header data + * @size: length of packet, excluding this header and optlen + * @src_node_id: source node + * @src_port_id: source port + * @dst_node_id: destination node + * @dst_port_id: destination port + */ +struct qrtr_hdr_v2 { + u8 version; + u8 type; + u8 flags; + u8 optlen; + __le32 size; + __le16 src_node_id; + __le16 src_port_id; + __le16 dst_node_id; + __le16 dst_port_id; +}; + +#define QRTR_FLAGS_CONFIRM_RX BIT(0) + struct qrtr_cb { u32 src_node; u32 src_port; @@ -58,7 +85,8 @@ struct qrtr_cb { u8 confirm_rx; }; -#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) +#define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ + sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ @@ -154,12 +182,12 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { - struct qrtr_hdr *hdr; + struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc = -ENODEV; - hdr = skb_push(skb, QRTR_HDR_SIZE); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr = skb_push(skb, sizeof(*hdr)); + hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); @@ -224,52 +252,80 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; - const struct qrtr_hdr *phdr = data; + const struct qrtr_hdr_v1 *v1; + const struct qrtr_hdr_v2 *v2; struct sk_buff *skb; struct qrtr_cb *cb; - unsigned int psize; unsigned int size; - unsigned int type; unsigned int ver; - unsigned int dst; + size_t hdrlen; - if (len < QRTR_HDR_SIZE || len & 3) + if (len & 3) return -EINVAL; - ver = le32_to_cpu(phdr->version); - size = le32_to_cpu(phdr->size); - type = le32_to_cpu(phdr->type); - dst = le32_to_cpu(phdr->dst_port_id); + skb = netdev_alloc_skb(NULL, len); + if (!skb) + return -ENOMEM; - psize = (size + 3) & ~3; + cb = (struct qrtr_cb *)skb->cb; - if (ver != QRTR_PROTO_VER) - return -EINVAL; + /* Version field in v1 is little endian, so this works for both cases */ + ver = *(u8*)data; - if (len != psize + QRTR_HDR_SIZE) - return -EINVAL; + switch (ver) { + case QRTR_PROTO_VER_1: + v1 = data; + hdrlen = sizeof(*v1); - if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) - return -EINVAL; + cb->type = le32_to_cpu(v1->type); + cb->src_node = le32_to_cpu(v1->src_node_id); + cb->src_port = le32_to_cpu(v1->src_port_id); + cb->confirm_rx = !!v1->confirm_rx; + cb->dst_node = le32_to_cpu(v1->dst_node_id); + cb->dst_port = le32_to_cpu(v1->dst_port_id); - skb = netdev_alloc_skb(NULL, len); - if (!skb) - return -ENOMEM; + size = le32_to_cpu(v1->size); + break; + case QRTR_PROTO_VER_2: + v2 = data; + hdrlen = sizeof(*v2) + v2->optlen; + + cb->type = v2->type; + cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); + cb->src_node = le16_to_cpu(v2->src_node_id); + cb->src_port = le16_to_cpu(v2->src_port_id); + cb->dst_node = le16_to_cpu(v2->dst_node_id); + cb->dst_port = le16_to_cpu(v2->dst_port_id); + + if (cb->src_port == (u16)QRTR_PORT_CTRL) + cb->src_port = QRTR_PORT_CTRL; + if (cb->dst_port == (u16)QRTR_PORT_CTRL) + cb->dst_port = QRTR_PORT_CTRL; + + size = le32_to_cpu(v2->size); + break; + default: + pr_err("qrtr: Invalid version %d\n", ver); + goto err; + } - cb = (struct qrtr_cb *)skb->cb; - cb->src_node = le32_to_cpu(phdr->src_node_id); - cb->src_port = le32_to_cpu(phdr->src_port_id); - cb->dst_node = le32_to_cpu(phdr->dst_node_id); - cb->dst_port = le32_to_cpu(phdr->dst_port_id); - cb->type = type; - cb->confirm_rx = !!phdr->confirm_rx; + if (len != ALIGN(size, 4) + hdrlen) + goto err; - skb_put_data(skb, data + QRTR_HDR_SIZE, size); + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + goto err; + + skb_put_data(skb, data + hdrlen, size); skb_queue_tail(&node->rx_queue, skb); schedule_work(&node->work); return 0; + +err: + kfree_skb(skb); + return -EINVAL; + } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); @@ -287,11 +343,11 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; - skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); + skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL); if (!skb) return NULL; - skb_reserve(skb, QRTR_HDR_SIZE); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); *pkt = skb_put_zero(skb, pkt_len); return skb; @@ -720,12 +776,12 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } plen = (len + 3) & ~3; - skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE, + skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) goto out_node; - skb_reserve(skb, QRTR_HDR_SIZE); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { -- cgit v1.2.3 From 843e79d05addd8eb06992cd6dfafc7b9d53f2bc8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:07 +0200 Subject: net: sched: make tc_action_ops->get_dev return dev and avoid passing net Return dev directly, NULL if not possible. That is enough. Makes no sense to pass struct net * to get_dev op, as there is only one net possible, the one the action was created in. So just store it in mirred priv and use directly. Rename the mirred op callback function. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 13 +++++-------- net/sched/cls_api.c | 6 ++---- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 416627c66f08..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -140,6 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; + m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); @@ -313,15 +314,11 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static int tcf_mirred_device(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev) +static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { - int ifindex = tcf_mirred_ifindex(a); + struct tcf_mirred *m = to_mirred(a); - *mirred_dev = __dev_get_by_index(net, ifindex); - if (!*mirred_dev) - return -EINVAL; - return 0; + return __dev_get_by_index(m->net, m->tcfm_ifindex); } static struct tc_action_ops act_mirred_ops = { @@ -336,7 +333,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), - .get_dev = tcf_mirred_device, + .get_dev = tcf_mirred_get_dev, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..450873b0c4b9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1016,10 +1016,8 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, tcf_exts_to_list(exts, &actions); list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) { - a->ops->get_dev(a, dev_net(dev), hw_dev); - break; - } + if (a->ops->get_dev) + *hw_dev = a->ops->get_dev(a); } if (*hw_dev) return 0; -- cgit v1.2.3 From b3f55bdda8df55a563005e00b1b71212d8546541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:08 +0200 Subject: net: sched: introduce per-egress action device callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule and specified device acts as tc action egress device. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 30 +++++++ 2 files changed, 250 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da6fa82c98a8..ac97db92ab68 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -1249,8 +1251,226 @@ out_module_put: return skb->len; } +struct tcf_action_net { + struct rhashtable egdev_ht; +}; + +static unsigned int tcf_action_net_id; + +struct tcf_action_egdev_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_priv; +}; + +struct tcf_action_egdev { + struct rhash_head ht_node; + const struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params tcf_action_egdev_ht_params = { + .key_offset = offsetof(struct tcf_action_egdev, dev), + .head_offset = offsetof(struct tcf_action_egdev, ht_node), + .key_len = sizeof(const struct net_device *), +}; + +static struct tcf_action_egdev * +tcf_action_egdev_lookup(const struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_lookup_fast(&tan->egdev_ht, &dev, + tcf_action_egdev_ht_params); +} + +static struct tcf_action_egdev * +tcf_action_egdev_get(const struct net_device *dev) +{ + struct tcf_action_egdev *egdev; + struct tcf_action_net *tan; + + egdev = tcf_action_egdev_lookup(dev); + if (egdev) + goto inc_ref; + + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); + if (!egdev) + return NULL; + INIT_LIST_HEAD(&egdev->cb_list); + tan = net_generic(dev_net(dev), tcf_action_net_id); + rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + +inc_ref: + egdev->refcnt++; + return egdev; +} + +static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) +{ + struct tcf_action_net *tan; + + if (--egdev->refcnt) + return; + tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); + rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + kfree(egdev); +} + +static struct tcf_action_egdev_cb * +tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) + if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) + return egdev_cb; + return NULL; +} + +static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, + enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_action_egdev_cb *egdev_cb; + int ok_count = 0; + int err; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) { + err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + +static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(egdev_cb)) + return -EEXIST; + egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); + if (!egdev_cb) + return -ENOMEM; + egdev_cb->cb = cb; + egdev_cb->cb_priv = cb_priv; + list_add(&egdev_cb->list, &egdev->cb_list); + return 0; +} + +static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(!egdev_cb)) + return; + list_del(&egdev_cb->list); + kfree(egdev_cb); +} + +static int __tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); + int err; + + if (!egdev) + return -ENOMEM; + err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); + if (err) + goto err_cb_add; + return 0; + +err_cb_add: + tcf_action_egdev_put(egdev); + return err; +} +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + int err; + + rtnl_lock(); + err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); + +static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (WARN_ON(!egdev)) + return; + tcf_action_egdev_cb_del(egdev, cb, cb_priv); + tcf_action_egdev_put(egdev); +} +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + rtnl_lock(); + __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); + +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (!egdev) + return 0; + return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); + +static __net_init int tcf_action_net_init(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); +} + +static void __net_exit tcf_action_net_exit(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + rhashtable_destroy(&tan->egdev_ht); +} + +static struct pernet_operations tcf_action_net_ops = { + .init = tcf_action_net_init, + .exit = tcf_action_net_exit, + .id = &tcf_action_net_id, + .size = sizeof(struct tcf_action_net), +}; + static int __init tc_action_init(void) { + int err; + + err = register_pernet_subsys(&tcf_action_net_ops); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 450873b0c4b9..99f9432f63cf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1026,6 +1026,36 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, } EXPORT_SYMBOL(tcf_exts_get_dev); +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + int ok_count = 0; +#ifdef CONFIG_NET_CLS_ACT + const struct tc_action *a; + struct net_device *dev; + LIST_HEAD(actions); + int ret; + + if (!tcf_exts_has_actions(exts)) + return 0; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (!a->ops->get_dev) + continue; + dev = a->ops->get_dev(a); + if (!dev || !tc_can_offload(dev)) + continue; + ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; + } +#endif + return ok_count; +} +EXPORT_SYMBOL(tcf_exts_egdev_cb_call); + static int __init tc_filter_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); -- cgit v1.2.3 From 717503b9cf57c0bb7ea4d3a9f5699c9a04adf988 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:09 +0200 Subject: net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra The only user of cls_flower->egress_dev is mlx5. So do the conversion there alongside with the code originating the call in cls_flower function fl_hw_replace_filter to the newly introduced egress device callback infrastucture. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 13 ++++++++--- net/sched/cls_flower.c | 63 +++++++++++++++++++++++++++----------------------- 2 files changed, 44 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 99f9432f63cf..51994a202585 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1026,8 +1026,9 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, } EXPORT_SYMBOL(tcf_exts_get_dev); -int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop) +static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, + enum tc_setup_type type, + void *type_data, bool err_stop) { int ok_count = 0; #ifdef CONFIG_NET_CLS_ACT @@ -1054,7 +1055,13 @@ int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, #endif return ok_count; } -EXPORT_SYMBOL(tcf_exts_egdev_cb_call); + +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); +} +EXPORT_SYMBOL(tc_setup_cb_call); static int __init tc_filter_init(void) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index db831ac708f6..5b7bb968d1d4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -88,7 +88,6 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; - struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -201,16 +200,17 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -220,20 +220,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + bool skip_sw = tc_skip_sw(f->flags); int err; - if (!tc_can_offload(dev)) { - if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev))) { - f->hw_dev = dev; - return tc_skip_sw(f->flags) ? -EINVAL : 0; - } - dev = f->hw_dev; - cls_flower.egress_dev = true; - } else { - f->hw_dev = dev; - } - tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; @@ -242,31 +231,47 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (!err) - f->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + if (err) { + if (skip_sw) + return err; + } else { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + } - if (tc_skip_sw(f->flags)) + err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, skip_sw); + if (err < 0) { + fl_hw_destroy_filter(tp, f); return err; + } else if (err > 0) { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) -- cgit v1.2.3 From 7578d7b45ed870b13a8ace57e32feaed623c2a94 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:10 +0200 Subject: net: sched: remove unused tcf_exts_get_dev helper and cls_flower->egress_dev The helper and the struct field ares no longer used by any code, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 51994a202585..2977b8a90851 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1004,28 +1004,6 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev) -{ -#ifdef CONFIG_NET_CLS_ACT - const struct tc_action *a; - LIST_HEAD(actions); - - if (!tcf_exts_has_actions(exts)) - return -EINVAL; - - tcf_exts_to_list(exts, &actions); - list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) - *hw_dev = a->ops->get_dev(a); - } - if (*hw_dev) - return 0; -#endif - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(tcf_exts_get_dev); - static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop) -- cgit v1.2.3 From 4ea2607f7871ca433ab0c5300289215974213f26 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:28:00 +0200 Subject: ipv6: addrconf: don't use rtnl mutex in RTM_GETNETCONF Instead of relying on rtnl mutex bump device reference count. After this change, values reported can change in parallel, but thats not much different from current state, as anyone can change the settings right after rtnl_unlock (and before userspace processed reply). While at it, switch to GFP_KERNEL allocation. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d9f6226694eb..5207f567ef28 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -616,23 +616,23 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; + struct inet6_dev *in6_dev = NULL; + struct net_device *dev = NULL; struct netconfmsg *ncm; struct sk_buff *skb; struct ipv6_devconf *devconf; - struct inet6_dev *in6_dev; - struct net_device *dev; int ifindex; int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, devconf_ipv6_policy, extack); if (err < 0) - goto errout; + return err; - err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) - goto errout; + return -EINVAL; + err = -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: @@ -642,10 +642,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, devconf = net->ipv6.devconf_dflt; break; default: - dev = __dev_get_by_index(net, ifindex); + dev = dev_get_by_index(net, ifindex); if (!dev) - goto errout; - in6_dev = __in6_dev_get(dev); + return -EINVAL; + in6_dev = in6_dev_get(dev); if (!in6_dev) goto errout; devconf = &in6_dev->cnf; @@ -653,7 +653,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -669,6 +669,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: + if (in6_dev) + in6_dev_put(in6_dev); + if (dev) + dev_put(dev); return err; } @@ -6570,7 +6574,7 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, 0); + inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); ipv6_addr_label_rtnl_register(); -- cgit v1.2.3 From c24675f871d3bca569532b3744962dd49938ddff Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:28:01 +0200 Subject: ipv6: addrconf: don't use rtnl mutex in RTM_GETADDR Similar to the previous patch, use the device lookup functions that bump device refcount and flag this as DOIT_UNLOCKED to avoid rtnl mutex. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5207f567ef28..4603aa488f4f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4890,17 +4890,15 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); if (err < 0) - goto errout; + return err; addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) { - err = -EINVAL; - goto errout; - } + if (!addr) + return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(net, ifm->ifa_index); ifa = ipv6_get_ifaddr(net, addr, dev, 1); if (!ifa) { @@ -4926,6 +4924,8 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: + if (dev) + dev_put(dev); return err; } @@ -6568,7 +6568,7 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, 0); + inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, -- cgit v1.2.3 From 1a37b770cf78dca77c386f8a95e05403233e4d52 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:17:57 +0100 Subject: sctp: make array sctp_sched_ops static The array sctp_sched_ops is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'sctp_sched_ops' was not declared. Should it be static? Signed-off-by: Colin Ian King Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 03513a9fa110..0b83ec51e43b 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -124,7 +124,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = { extern struct sctp_sched_ops sctp_sched_prio; extern struct sctp_sched_ops sctp_sched_rr; -struct sctp_sched_ops *sctp_sched_ops[] = { +static struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, &sctp_sched_prio, &sctp_sched_rr, -- cgit v1.2.3 From 14c68c43b7695cbcfe148bf3c513b598c2886e7d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 10:53:28 +0100 Subject: net: mpls: make function ipgre_mpls_encap_hlen static The function ipgre_mpls_encap_hlen is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'ipgre_mpls_encap_hlen' was not declared. Should it be static? Fixes: bdc476413dcdb ("ip_tunnel: add mpls over gre support") Signed-off-by: Colin Ian King Acked-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 9745e8f69810..8ca9915befc8 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -41,7 +41,7 @@ static int label_limit = (1 << 20) - 1; static int ttl_max = 255; #if IS_ENABLED(CONFIG_NET_IP_TUNNEL) -size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) +static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct mpls_shim_hdr); } -- cgit v1.2.3 From a67a4893f37d5389c812d85ccf1e258b87ef4ead Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Oct 2017 11:23:04 +0200 Subject: cfg80211: remove set but never used variable cf_offset Perhaps it had been intended to be used, but it clearly isn't. Signed-off-by: Johannes Berg --- net/wireless/chan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index b8aa5a7d5c77..eb824270f6e3 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -464,7 +464,7 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan) { int width; - u32 cf_offset, freq; + u32 freq; if (chandef->chan->center_freq == chan->center_freq) return true; @@ -473,8 +473,6 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, if (width <= 20) return false; - cf_offset = width / 2 - 10; - for (freq = chandef->center_freq1 - width / 2 + 10; freq <= chandef->center_freq1 + width / 2 - 10; freq += 20) { if (chan->center_freq == freq) -- cgit v1.2.3 From bad5680b5052f9d3008a8cfcc0b34ba0da08401d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 7 Oct 2017 14:21:22 +0200 Subject: batman-adv: Add missing kerneldoc for extack The parameter extack was added to batadv_softif_slave_add without adding the kernel-doc for it. This caused kernel-doc warnings. Signed-off-by: Sven Eckelmann Acked-by: David Ahern Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 543d2c3e0f0d..9f673cdfecf8 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -863,6 +863,7 @@ free_bat_counters: * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface + * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -- cgit v1.2.3 From 60724d4bae14cd295b27b1610cad9a2720eb0860 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:48 -0700 Subject: net: dsa: Add support for DSA specific notifiers In preparation for communicating a given DSA network device's port number and switch index, create a specialized DSA notifier and two events: DSA_PORT_REGISTER and DSA_PORT_UNREGISTER that communicate: the slave network device (slave_dev), port number and switch number in the tree. This will be later used for network device drivers like bcmsysport which needs to cooperate with its DSA network devices to set-up queue mapping and scheduling. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 23 +++++++++++++++++++++++ net/dsa/slave.c | 13 +++++++++++++ 2 files changed, 36 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 51ca2a524a27..832c659ff993 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,28 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } +static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); + +int register_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(register_dsa_notifier); + +int unregister_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_dsa_notifier); + +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + info->dev = dev; + return atomic_notifier_call_chain(&dsa_notif_chain, val, info); +} +EXPORT_SYMBOL_GPL(call_dsa_notifiers); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fb2954ff198c..45f4ea845c07 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1116,6 +1116,7 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { + struct dsa_notifier_register_info rinfo = { }; struct dsa_switch *ds = port->ds; struct net_device *master; struct net_device *slave_dev; @@ -1177,6 +1178,12 @@ int dsa_slave_create(struct dsa_port *port, const char *name) goto out_free; } + rinfo.info.dev = slave_dev; + rinfo.master = master; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_REGISTER, slave_dev, &rinfo.info); + ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", @@ -1200,6 +1207,7 @@ out_free: void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + struct dsa_notifier_register_info rinfo = { }; struct device_node *port_dn; port_dn = p->dp->dn; @@ -1211,6 +1219,11 @@ void dsa_slave_destroy(struct net_device *slave_dev) if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } + rinfo.info.dev = slave_dev; + rinfo.master = p->dp->cpu_dp->netdev; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_UNREGISTER, slave_dev, &rinfo.info); unregister_netdev(slave_dev); free_percpu(p->stats64); free_netdev(slave_dev); -- cgit v1.2.3 From 0a5f14ce67a6e093e651d3cd75e6ac281123d93a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:49 -0700 Subject: net: dsa: tag_brcm: Indicate to master netdevice port + queue We need to tell the DSA master network device doing the actual transmission what the desired switch port and queue number is for it to resolve that to the internal transmit queue it is mapped to. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_brcm.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8e4bdb9d9ae3..cc4f472fbd77 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -86,6 +86,12 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; + /* Now tell the master network device about the desired output queue + * as well + */ + skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(p->dp->index, + queue)); + return skb; } -- cgit v1.2.3 From d921c420d2efa5731011b922e5a2f2ad33203c0b Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 11 Oct 2017 13:47:22 +0200 Subject: net/smc: replace function pointer get_netdev() SMC should not open code the function pointer get_netdev of the IB device. Replacing ib_query_gid(..., NULL) with ib_query_gid(..., gid_attr) allows access to the netdev. Signed-off-by: Ursula Braun Suggested-by: Parav Pandit Reviewed-by: Parav Pandit Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0b5852299158..468e1d725d97 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -369,26 +369,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) { - struct net_device *ndev; + struct ib_gid_attr gattr; int rc; rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], NULL); - /* the SMC protocol requires specification of the roce MAC address; - * if net_device cannot be determined, it can be derived from gid 0 - */ - ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); - if (ndev) { - memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); - dev_put(ndev); - } else if (!rc) { - memcpy(&smcibdev->mac[ibport - 1][0], - &smcibdev->gid[ibport - 1].raw[8], 3); - memcpy(&smcibdev->mac[ibport - 1][3], - &smcibdev->gid[ibport - 1].raw[13], 3); - smcibdev->mac[ibport - 1][0] &= ~0x02; - } - return rc; + &smcibdev->gid[ibport - 1], &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; } /* Create an identifier unique for this instance of SMC-R. @@ -419,6 +410,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) &smcibdev->pattr[ibport - 1]); if (rc) goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); if (rc) goto out; -- cgit v1.2.3 From 43e2ada3e06aefe3596be75bd05b34ef14fd1f7c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 11 Oct 2017 13:47:23 +0200 Subject: net/smc: dev_put for netdev after usage of ib_query_gid() For RoCEs ib_query_gid() takes a reference count on the net_device. This reference count must be decreased by the caller. Signed-off-by: Ursula Braun Reported-by: Parav Pandit Reviewed-by: Parav Pandit Fixes: 0cfdd8f92cac ("smc: connection and link group creation") Signed-off-by: David S. Miller --- net/smc/smc_core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20b66e79c5d6..5f6a20084157 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -380,10 +380,14 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, &gattr)) continue; - if (gattr.ndev && - (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) { - lnk->gid = gid; - return 0; + if (gattr.ndev) { + if (is_vlan_dev(gattr.ndev) && + vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { + lnk->gid = gid; + dev_put(gattr.ndev); + return 0; + } + dev_put(gattr.ndev); } } return -ENODEV; -- cgit v1.2.3 From 0eb16f82ecd40d4ce344ec882d6fb5c330f8ef39 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Oct 2017 15:55:31 +0200 Subject: ip_tunnel: fix building with NET_IP_TUNNEL=m When af_mpls is built-in but the tunnel support is a module, we get a link failure: net/mpls/af_mpls.o: In function `mpls_init': af_mpls.c:(.init.text+0xdc): undefined reference to `ip_tunnel_encap_add_ops' This adds a Kconfig statement to prevent the broken configuration and force mpls to be a module as well in this case. Fixes: bdc476413dcd ("ip_tunnel: add mpls over gre support") Signed-off-by: Arnd Bergmann Acked-by: Amine Kherbouche Signed-off-by: David S. Miller --- net/mpls/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 5c467ef97311..801ea9098387 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,6 +24,7 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" + depends on NET_IP_TUNNEL || NET_IP_TUNNEL=n ---help--- Add support for forwarding of mpls packets. -- cgit v1.2.3 From 8f04748016f3b583e675e0f649d42cfc10812a8b Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 11 Oct 2017 10:50:29 -0400 Subject: net sched actions: change IFE modules alias names Make style of module alias name consistent with other subsystems in kernel, for example net devices. Fixes: 084e2f6566d2 ("Support to encoding decoding skb mark on IFE action") Fixes: 200e10f46936 ("Support to encoding decoding skb prio on IFE action") Fixes: 408fbc22ef1e ("net sched ife action: Introduce skb tcindex metadata encap decap") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_ife.c | 2 +- net/sched/act_meta_mark.c | 2 +- net/sched/act_meta_skbprio.c | 2 +- net/sched/act_meta_skbtcindex.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8ccd35825b6b..791aeee11c7e 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -263,7 +263,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ifemeta%u", metaid); + request_module("ife-meta-%u", metaid); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 82892170ce4f..1e3f10e5da99 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -76,4 +76,4 @@ module_exit(ifemark_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); +MODULE_ALIAS_IFE_META("skbmark"); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c index 26bf4d86030b..4033f9fc4d4a 100644 --- a/net/sched/act_meta_skbprio.c +++ b/net/sched/act_meta_skbprio.c @@ -73,4 +73,4 @@ module_exit(ifeprio_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_PRIO); +MODULE_ALIAS_IFE_META("skbprio"); diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 3b35774ce890..2ea1f26c9e96 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -76,4 +76,4 @@ module_exit(ifetc_index_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2016)"); MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); +MODULE_ALIAS_IFE_META("tcindex"); -- cgit v1.2.3 From d3f24ba895f0bbbc8ab0ecb03de7daa6eccc7ceb Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 11 Oct 2017 10:50:30 -0400 Subject: net sched actions: fix module auto-loading Macro __stringify_1() can stringify a macro argument, however IFE_META_* are enums, so they never expand, however request_module expects an integer in IFE module name, so as a result it always fails to auto-load. Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Roman Mashak Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 791aeee11c7e..e0bc228c1218 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -248,6 +248,20 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) return ret; } +static const char *ife_meta_id2name(u32 metaid) +{ + switch (metaid) { + case IFE_META_SKBMARK: + return "skbmark"; + case IFE_META_PRIO: + return "skbprio"; + case IFE_META_TCINDEX: + return "tcindex"; + default: + return "unknown"; + } +} + /* called when adding new meta information * under ife->tcf_lock for existing action */ @@ -263,7 +277,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ife-meta-%u", metaid); + request_module("ife-meta-%s", ife_meta_id2name(metaid)); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); -- cgit v1.2.3 From 734534e9a8e537d33d3598fa03b98eb089819961 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:06 -0400 Subject: sched: act: ife: move encode/decode check to init This patch adds the check of the two possible ife handlings encode and decode to the init callback. The decode value is for usability aspect and used in userspace code only. The current code offers encode else decode only. This patch avoids any other option than this. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ife.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e0bc228c1218..21c10f5d1247 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -464,6 +464,13 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); + /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because + * they cannot run as the same time. Check on all other values which + * are not supported right now. + */ + if (parm->flags & ~IFE_ENCODE) + return -EINVAL; + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; @@ -786,17 +793,7 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, if (ife->flags & IFE_ENCODE) return tcf_ife_encode(skb, a, res); - if (!(ife->flags & IFE_ENCODE)) - return tcf_ife_decode(skb, a, res); - - pr_info_ratelimited("unknown failure(policy neither de/encode\n"); - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); - tcf_lastuse_update(&ife->tcf_tm); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - - return TC_ACT_SHOT; + return tcf_ife_decode(skb, a, res); } static int tcf_ife_walker(struct net *net, struct sk_buff *skb, -- cgit v1.2.3 From ced273eacfe1876e2c3c4ea1244a2e386e20eadb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:07 -0400 Subject: sched: act: ife: migrate to use per-cpu counters This patch migrates the current counter handling which is protected by a spinlock to a per-cpu counter handling. This reduce the time where the spinlock is being held. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ife.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 21c10f5d1247..f59d78918cf9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -477,7 +477,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, - bind, false); + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -638,19 +638,15 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); - spin_unlock(&ife->tcf_lock); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -668,14 +664,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); - ife->tcf_qstats.overlimits++; + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -727,23 +721,20 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ - ife->tcf_qstats.overlimits++; - spin_unlock(&ife->tcf_lock); + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -752,6 +743,8 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ife_meta = ife_encode(skb, metalen); + spin_lock(&ife->tcf_lock); + /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... @@ -763,8 +756,8 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } if (err < 0) { /* too corrupt to keep around if overwritten */ - ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; -- cgit v1.2.3 From aa9fd9a325d51fa0b11153b03b8fefff569fa955 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:08 -0400 Subject: sched: act: ife: update parameters via rcu handling This patch changes the parameter updating via RCU and not protected by a spinlock anymore. This reduce the time that the spinlock is being held. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ife.c | 87 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index f59d78918cf9..252ee7d8c731 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -406,10 +406,14 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); spin_unlock_bh(&ife->tcf_lock); + + p = rcu_dereference_protected(ife->params, 1); + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -446,6 +450,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_params *p, *p_old; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -471,24 +476,34 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (parm->flags & ~IFE_ENCODE) return -EINVAL; + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + exists = tcf_idr_check(tn, parm->index, a, bind); - if (exists && bind) + if (exists && bind) { + kfree(p); return 0; + } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); - if (ret) + if (ret) { + kfree(p); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + kfree(p); return -EEXIST; + } } ife = to_ife(*a); - ife->flags = parm->flags; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) @@ -499,24 +514,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - if (exists) - spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { if (daddr) - ether_addr_copy(ife->eth_dst, daddr); + ether_addr_copy(p->eth_dst, daddr); else - eth_zero_addr(ife->eth_dst); + eth_zero_addr(p->eth_dst); if (saddr) - ether_addr_copy(ife->eth_src, saddr); + ether_addr_copy(p->eth_src, saddr); else - eth_zero_addr(ife->eth_src); + eth_zero_addr(p->eth_src); - ife->eth_type = ife_type; + p->eth_type = ife_type; } + if (exists) + spin_lock_bh(&ife->tcf_lock); + if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); @@ -532,6 +548,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } @@ -552,6 +569,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } } @@ -559,6 +577,11 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + p_old = rtnl_dereference(ife->params); + rcu_assign_pointer(ife->params, p); + if (p_old) + kfree_rcu(p_old, rcu); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -570,12 +593,13 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, .bindcnt = ife->tcf_bindcnt - bind, .action = ife->tcf_action, - .flags = ife->flags, + .flags = p->flags, }; struct tcf_t t; @@ -586,17 +610,17 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; - if (!is_zero_ether_addr(ife->eth_dst)) { - if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + if (!is_zero_ether_addr(p->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } - if (!is_zero_ether_addr(ife->eth_src)) { - if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + if (!is_zero_ether_addr(p->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } - if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { @@ -698,7 +722,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) + struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; @@ -762,19 +786,18 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } skboff += err; } + spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; - if (!is_zero_ether_addr(ife->eth_src)) - ether_addr_copy(oethh->h_source, ife->eth_src); - if (!is_zero_ether_addr(ife->eth_dst)) - ether_addr_copy(oethh->h_dest, ife->eth_dst); - oethh->h_proto = htons(ife->eth_type); + if (!is_zero_ether_addr(p->eth_src)) + ether_addr_copy(oethh->h_source, p->eth_src); + if (!is_zero_ether_addr(p->eth_dst)) + ether_addr_copy(oethh->h_dest, p->eth_dst); + oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); - spin_unlock(&ife->tcf_lock); - return action; } @@ -782,9 +805,17 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); - - if (ife->flags & IFE_ENCODE) - return tcf_ife_encode(skb, a, res); + struct tcf_ife_params *p; + int ret; + + rcu_read_lock(); + p = rcu_dereference(ife->params); + if (p->flags & IFE_ENCODE) { + ret = tcf_ife_encode(skb, a, res, p); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); return tcf_ife_decode(skb, a, res); } -- cgit v1.2.3 From 1188e2a9ef223f5c670301d54a6f65d223a87582 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 14:04:30 +0200 Subject: cfg80211: don't print log output for building shipped-certs Building an allmodconfig kernel with 'make -s' now prints a single line: GEN net/wireless/shipped-certs.c Using '$(kecho)' here will skip the output with 'make -s' but otherwise keeps printing it, which is consistent with how we handle all the other output. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 219baea57e4e..e585f3f71f77 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -23,7 +23,7 @@ cfg80211-y += extra-certs.o endif $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) - @echo " GEN $@" + @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 shipped_regdb_certs[] = {' >> $@ @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done @@ -32,7 +32,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) - @echo " GEN $@" + @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 extra_regdb_certs[] = {' >> $@ @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done -- cgit v1.2.3 From 88230ef1f31bf2d8fcf42c20e5743ff4b3618a29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 14:04:31 +0200 Subject: cfg80211: fix CFG80211_EXTRA_REGDB_KEYDIR typo The missing CONFIG_ prefix means this macro is never defined, leading to a possible Kbuild warning: net/wireless/reg.c:666:20: error: 'load_keys_from_buffer' defined but not used [-Werror=unused-function] static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) When we use the correct symbol, the warning also goes away. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 58319c82ecb3..3871998059de 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -723,7 +723,7 @@ static int __init load_builtin_regdb_keys(void) #ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS load_keys_from_buffer(shipped_regdb_certs, shipped_regdb_certs_len); #endif -#ifdef CFG80211_EXTRA_REGDB_KEYDIR +#ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') load_keys_from_buffer(extra_regdb_certs, extra_regdb_certs_len); #endif -- cgit v1.2.3 From b1b1ae2c1c150f8db5d3523c74e81eaf8cae5cbb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 13 Oct 2017 15:26:01 +0300 Subject: mac80211: don't track HT capability changes The code here (more or less accidentally) tracks the HT capability of the AP when connected, and we found at least one AP that erroneously toggles its 20/40 capability bit when changing between 20/40 MHz. The connection to the AP is then broken because we set the 40 MHz disable flag based on this, as soon as it switches to 20 MHz, but because the flag then changed, we disconnect. I'd be inclined to just ignore this issue, since we then reconnect while the AP is in 20 MHz mode and never use 40 MHz with it again, but this code is a bit strange anyway - we don't use the capabilities for anything else. Change the code to simply not track the HT capabilities at all, which assumes that the AP at least sets 20/40 capability when operating in 40 MHz (or higher). If not, rate scaling might end up using only the narrower bandwidth. The new behaviour also mirrors what VHT does, where we only check the VHT operation. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ee5ca1bc5a20..e4ededa1909d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,7 +145,6 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, - const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, struct cfg80211_chan_def *chandef, bool tracking) @@ -163,20 +162,13 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->center_freq1 = channel->center_freq; chandef->center_freq2 = 0; - if (!ht_cap || !ht_oper || !sta_ht_cap.ht_supported) { + if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; goto out; } chandef->width = NL80211_CHAN_WIDTH_20; - if (!(ht_cap->cap_info & - cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) { - ret = IEEE80211_STA_DISABLE_40MHZ; - vht_chandef = *chandef; - goto out; - } - ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ @@ -344,7 +336,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, /* calculate new channel (type) based on HT/VHT operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_cap, ht_oper, vht_oper, + ht_oper, vht_oper, &chandef, true); /* @@ -4312,7 +4304,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_cap, ht_oper, vht_oper, + ht_oper, vht_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), -- cgit v1.2.3 From 14c04493cb77bc38404dbcb39d5ccbb667831ad7 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:17 +0200 Subject: tipc: add ability to order and receive topology events in driver As preparation for introducing communication groups, we add the ability to issue topology subscriptions and receive topology events from kernel space. This will make it possible for group member sockets to keep track of other group members. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/core.h | 5 +++ net/tipc/msg.h | 1 + net/tipc/server.c | 121 ++++++++++++++++++++++++++++++++++++++++++------------ net/tipc/server.h | 5 ++- net/tipc/socket.c | 32 +++++++++------ 5 files changed, 124 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index 5cc5398be722..964342689f2c 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -132,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline struct tipc_server *tipc_topsrv(struct net *net) +{ + return tipc_net(net)->topsrv; +} + static inline unsigned int tipc_hashfn(u32 addr) { return addr & (NODE_HTABLE_SIZE - 1); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index c843fd2bc48d..d058b1c464e9 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -78,6 +78,7 @@ struct plist; #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define SOCK_WAKEUP 14 /* pseudo user */ +#define TOP_SRV 15 /* pseudo user */ /* * Message header sizes diff --git a/net/tipc/server.c b/net/tipc/server.c index 3cd6402e812c..713077536d0c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -36,6 +36,8 @@ #include "server.h" #include "core.h" #include "socket.h" +#include "addr.h" +#include "msg.h" #include #include @@ -105,13 +107,11 @@ static void tipc_conn_kref_release(struct kref *kref) kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); sock_release(sock); con->sock = NULL; - - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); } - + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); tipc_clean_outqueues(con); kfree(con); } @@ -197,7 +197,8 @@ static void tipc_close_conn(struct tipc_conn *con) struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - tipc_unregister_callbacks(con); + if (con->sock) + tipc_unregister_callbacks(con); if (con->conid) s->tipc_conn_release(con->conid, con->usr_data); @@ -207,8 +208,8 @@ static void tipc_close_conn(struct tipc_conn *con) * are harmless for us here as we have already deleted this * connection from server connection list. */ - kernel_sock_shutdown(con->sock, SHUT_RDWR); - + if (con->sock) + kernel_sock_shutdown(con->sock, SHUT_RDWR); conn_put(con); } } @@ -487,38 +488,104 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 lower, u32 upper, int *conid) +{ + struct tipc_subscriber *scbr; + struct tipc_subscr sub; + struct tipc_server *s; + struct tipc_conn *con; + + sub.seq.type = type; + sub.seq.lower = lower; + sub.seq.upper = upper; + sub.timeout = TIPC_WAIT_FOREVER; + sub.filter = TIPC_SUB_PORTS; + *(u32 *)&sub.usr_handle = port; + + con = tipc_alloc_conn(tipc_topsrv(net)); + if (!con) + return false; + + *conid = con->conid; + s = con->server; + scbr = s->tipc_conn_new(*conid); + if (!scbr) { + tipc_close_conn(con); + return false; + } + + con->usr_data = scbr; + con->sock = NULL; + s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub)); + return true; +} + +void tipc_topsrv_kern_unsubscr(struct net *net, int conid) +{ + struct tipc_conn *con; + + con = tipc_conn_lookup(tipc_topsrv(net), conid); + if (!con) + return; + tipc_close_conn(con); + conn_put(con); +} + +static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) +{ + u32 port = *(u32 *)&evt->s.usr_handle; + u32 self = tipc_own_addr(net); + struct sk_buff_head evtq; + struct sk_buff *skb; + + skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), + self, self, port, port, 0); + if (!skb) + return; + msg_set_dest_droppable(buf_msg(skb), true); + memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); + skb_queue_head_init(&evtq); + __skb_queue_tail(&evtq, skb); + tipc_sk_rcv(net, &evtq); +} + static void tipc_send_to_sock(struct tipc_conn *con) { - int count = 0; struct tipc_server *s = con->server; struct outqueue_entry *e; + struct tipc_event *evt; struct msghdr msg; + int count = 0; int ret; spin_lock_bh(&con->outqueue_lock); while (test_bit(CF_CONNECTED, &con->flags)) { - e = list_entry(con->outqueue.next, struct outqueue_entry, - list); + e = list_entry(con->outqueue.next, struct outqueue_entry, list); if ((struct list_head *) e == &con->outqueue) break; - spin_unlock_bh(&con->outqueue_lock); - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + spin_unlock_bh(&con->outqueue_lock); - if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { - msg.msg_name = &e->dest; - msg.msg_namelen = sizeof(struct sockaddr_tipc); - } - ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, - e->iov.iov_len); - if (ret == -EWOULDBLOCK || ret == 0) { - cond_resched(); - goto out; - } else if (ret < 0) { - goto send_err; + if (con->sock) { + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { + msg.msg_name = &e->dest; + msg.msg_namelen = sizeof(struct sockaddr_tipc); + } + ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, + e->iov.iov_len); + if (ret == -EWOULDBLOCK || ret == 0) { + cond_resched(); + goto out; + } else if (ret < 0) { + goto send_err; + } + } else { + evt = e->iov.iov_base; + tipc_send_kern_top_evt(s->net, evt); } - /* Don't starve users filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); diff --git a/net/tipc/server.h b/net/tipc/server.h index 34f8055afa3b..2113c9192633 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -83,13 +83,16 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 lower, u32 upper, int *conid); +void tipc_topsrv_kern_unsubscr(struct net *net, int conid); + /** * tipc_conn_terminate - terminate connection with server * * Note: Must call it in process context since it might sleep */ void tipc_conn_terminate(struct tipc_server *s, int conid); - int tipc_server_start(struct tipc_server *s); void tipc_server_stop(struct tipc_server *s); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d50edd6e0019..9a7e7b5cf23f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -896,6 +896,10 @@ exit: kfree_skb(skb); } +static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt) +{ +} + /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -1671,20 +1675,24 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); unsigned int limit = rcvbuf_limit(sk, skb); int err = TIPC_OK; - int usr = msg_user(hdr); - u32 onode; - - if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - } - if (unlikely(usr == SOCK_WAKEUP)) { - onode = msg_orignode(hdr); + if (unlikely(!msg_isdata(hdr))) { + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_proto_rcv(tsk, skb, xmitq); + return false; + case SOCK_WAKEUP: + u32_del(&tsk->cong_links, msg_orignode(hdr)); + tsk->cong_link_cnt--; + sk->sk_write_space(sk); + break; + case TOP_SRV: + tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + break; + default: + break; + } kfree_skb(skb); - u32_del(&tsk->cong_links, onode); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); return false; } -- cgit v1.2.3 From 23998835be98a6842e5698fa1824f404c7de850d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:18 +0200 Subject: tipc: improve address sanity check in tipc_connect() The address given to tipc_connect() is not completely sanity checked, under the assumption that this will be done later in the function __tipc_sendmsg() when the address is used there. However, the latter functon will in the next commits serve as caller to several other send functions, so we want to move the corresponding sanity check there to the beginning of that function, before we possibly need to grab the address stored by tipc_connect(). We must therefore be able to trust that this address already has been thoroughly checked. We do this in this commit. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/socket.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9a7e7b5cf23f..7659e792ecdb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1911,28 +1911,27 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int previous; int res = 0; + if (destlen != sizeof(struct sockaddr_tipc)) + return -EINVAL; + lock_sock(sk); - /* DGRAM/RDM connect(), just save the destaddr */ - if (tipc_sk_type_connectionless(sk)) { - if (dst->family == AF_UNSPEC) { - memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); - } else if (destlen != sizeof(struct sockaddr_tipc)) { + if (dst->family == AF_UNSPEC) { + memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); + if (!tipc_sk_type_connectionless(sk)) res = -EINVAL; - } else { - memcpy(&tsk->peer, dest, destlen); - } goto exit; + } else if (dst->family != AF_TIPC) { + res = -EINVAL; } - - /* - * Reject connection attempt using multicast address - * - * Note: send_msg() validates the rest of the address fields, - * so there's no need to do it here - */ - if (dst->addrtype == TIPC_ADDR_MCAST) { + if (dst->addrtype != TIPC_ADDR_ID && dst->addrtype != TIPC_ADDR_NAME) res = -EINVAL; + if (res) + goto exit; + + /* DGRAM/RDM connect(), just save the destaddr */ + if (tipc_sk_type_connectionless(sk)) { + memcpy(&tsk->peer, dest, destlen); goto exit; } -- cgit v1.2.3 From 38077b8ef831daba55913f7e24732b062d0bdebb Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:19 +0200 Subject: tipc: add ability to obtain node availability status from other files In the coming commits, functions at the socket level will need the ability to read the availability status of a given node. We therefore introduce a new function for this purpose, while renaming the existing static function currently having the wanted name. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/node.c | 26 +++++++++++++++++++++----- net/tipc/node.h | 1 + 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 198dbc7adbe1..6cc1ae600820 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -157,7 +157,7 @@ static void tipc_node_timeout(unsigned long data); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static void tipc_node_put(struct tipc_node *node); -static bool tipc_node_is_up(struct tipc_node *n); +static bool node_is_up(struct tipc_node *n); struct tipc_sock_conn { u32 port; @@ -657,7 +657,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, *slot1 = i; } - if (!tipc_node_is_up(n)) { + if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); @@ -717,11 +717,27 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_sk_rcv(n->net, &le->inputq); } -static bool tipc_node_is_up(struct tipc_node *n) +static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } +bool tipc_node_is_up(struct net *net, u32 addr) +{ + struct tipc_node *n; + bool retval = false; + + if (in_own_node(net, addr)) + return true; + + n = tipc_node_find(net, addr); + if (!n) + return false; + retval = node_is_up(n); + tipc_node_put(n); + return retval; +} + void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_bearer *b, u16 capabilities, u32 signature, @@ -1149,7 +1165,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; - if (tipc_node_is_up(node)) + if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; @@ -1249,7 +1265,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) dst = n->addr; if (in_own_node(net, dst)) continue; - if (!tipc_node_is_up(n)) + if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) diff --git a/net/tipc/node.h b/net/tipc/node.h index 898c22916984..8db59feb122f 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -76,6 +76,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); -- cgit v1.2.3 From 64ac5f5977df5b276374fb2f051082129f5cdb22 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:20 +0200 Subject: tipc: refactor function filter_rcv() In the following commits we will need to handle multiple incoming and rejected/returned buffers in the function socket.c::filter_rcv(). As a preparation for this, we generalize the function by handling buffer queues instead of individual buffers. We also introduce a help function tipc_skb_reject(), and rename filter_rcv() to tipc_sk_filter_rcv() in line with other functions in socket.c. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/msg.c | 7 +++ net/tipc/msg.h | 2 + net/tipc/socket.c | 161 +++++++++++++++++++++++++++--------------------------- 3 files changed, 89 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 17146c16ee2d..1649d456e22d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -666,3 +666,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, } kfree_skb(skb); } + +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) + __skb_queue_tail(xmitq, skb); +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d058b1c464e9..be3e38aa9dd2 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -819,6 +819,8 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7659e792ecdb..bc226f5a1be3 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -111,7 +111,7 @@ struct tipc_sock { struct rcu_head rcu; }; -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); @@ -453,7 +453,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; - sk->sk_backlog_rcv = tipc_backlog_rcv; + sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; @@ -850,16 +850,16 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } /** - * tipc_sk_proto_rcv - receive a connection mng protocol message + * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - struct sock *sk = &tsk->sk; - u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); + u32 onode = tsk_own_node(tsk); + struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); bool conn_cong; @@ -1536,14 +1536,41 @@ static void tipc_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); } +static void tipc_sk_proto_rcv(struct sock *sk, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + return; + case SOCK_WAKEUP: + u32_del(&tsk->cong_links, msg_orignode(hdr)); + tsk->cong_link_cnt--; + sk->sk_write_space(sk); + break; + case TOP_SRV: + tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + break; + default: + break; + } + + kfree_skb(skb); +} + /** - * filter_connect - Handle all incoming messages for a connection-based socket + * tipc_filter_connect - Handle incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * * Returns true if everything ok, false otherwise */ -static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -1657,7 +1684,7 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) } /** - * filter_rcv - validate incoming message + * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. * @@ -1666,75 +1693,49 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * * Called with socket lock already taken * - * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { + bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); - unsigned int limit = rcvbuf_limit(sk, skb); - int err = TIPC_OK; + struct net *net = sock_net(sk); + struct sk_buff_head inputq; + int limit, err = TIPC_OK; - if (unlikely(!msg_isdata(hdr))) { - switch (msg_user(hdr)) { - case CONN_MANAGER: - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - case SOCK_WAKEUP: - u32_del(&tsk->cong_links, msg_orignode(hdr)); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); - break; - case TOP_SRV: - tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); - break; - default: - break; - } - kfree_skb(skb); - return false; - } + TIPC_SKB_CB(skb)->bytes_read = 0; + __skb_queue_head_init(&inputq); + __skb_queue_tail(&inputq, skb); - /* Drop if illegal message type */ - if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { - kfree_skb(skb); - return false; - } + if (unlikely(!msg_isdata(hdr))) + tipc_sk_proto_rcv(sk, &inputq, xmitq); + else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) + return kfree_skb(skb); - /* Reject if wrong message type for current socket state */ - if (tipc_sk_type_connectionless(sk)) { - if (msg_connected(hdr)) { + /* Validate and add to receive buffer if there is space */ + while ((skb = __skb_dequeue(&inputq))) { + hdr = buf_msg(skb); + limit = rcvbuf_limit(sk, skb); + if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || + (!sk_conn && msg_connected(hdr))) err = TIPC_ERR_NO_PORT; - goto reject; - } - } else if (unlikely(!filter_connect(tsk, skb))) { - err = TIPC_ERR_NO_PORT; - goto reject; - } + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + err = TIPC_ERR_OVERLOAD; - /* Reject message if there isn't room to queue it */ - if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { - err = TIPC_ERR_OVERLOAD; - goto reject; + if (unlikely(err)) { + tipc_skb_reject(net, err, skb, xmitq); + err = TIPC_OK; + continue; + } + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk); } - - /* Enqueue message */ - TIPC_SKB_CB(skb)->bytes_read = 0; - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - - sk->sk_data_ready(sk); - return true; - -reject: - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) - __skb_queue_tail(xmitq, skb); - return false; } /** - * tipc_backlog_rcv - handle incoming message from backlog queue + * tipc_sk_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @skb: message * @@ -1742,27 +1743,25 @@ reject: * * Returns 0 */ -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - unsigned int truesize = skb->truesize; + unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; u32 dnode, selector; + unsigned int added; __skb_queue_head_init(&xmitq); - if (likely(filter_rcv(sk, skb, &xmitq))) { - atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); - return 0; - } - - if (skb_queue_empty(&xmitq)) - return 0; + tipc_sk_filter_rcv(sk, skb, &xmitq); + added = sk_rmem_alloc_get(sk) - before; + atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); - /* Send response/rejected message */ - skb = __skb_dequeue(&xmitq); - dnode = msg_destnode(buf_msg(skb)); - selector = msg_origport(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + /* Send pending response/rejected messages, if any */ + while ((skb = __skb_dequeue(&xmitq))) { + selector = msg_origport(buf_msg(skb)); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + } return 0; } @@ -1794,7 +1793,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb, xmitq); + tipc_sk_filter_rcv(sk, skb, xmitq); continue; } -- cgit v1.2.3 From f70d37b796241f617107d5585ee96a7e1b660b63 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:21 +0200 Subject: tipc: add new function for sending multiple small messages We see an increasing need to send multiple single-buffer messages of TIPC_SYSTEM_IMPORTANCE to different individual destination nodes. Instead of looping over the send queue and sending each buffer individually, as we do now, we add a new help function tipc_node_distr_xmit() to do this. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/node.c | 16 ++++++++++++++++ net/tipc/node.h | 1 + net/tipc/socket.c | 14 ++------------ 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 6cc1ae600820..89f8ac73bf65 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1254,6 +1254,22 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +/* tipc_node_distr_xmit(): send single buffer msgs to individual destinations + * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected + */ +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb; + u32 selector, dnode; + + while ((skb = __skb_dequeue(xmitq))) { + selector = msg_origport(buf_msg(skb)); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, selector); + } + return 0; +} + void tipc_node_broadcast(struct net *net, struct sk_buff *skb) { struct sk_buff *txskb; diff --git a/net/tipc/node.h b/net/tipc/node.h index 8db59feb122f..df2f2197c4ad 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -68,6 +68,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector); +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *list); int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index bc226f5a1be3..c7c674934474 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1740,14 +1740,11 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, * @skb: message * * Caller must hold socket lock - * - * Returns 0 */ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; - u32 dnode, selector; unsigned int added; __skb_queue_head_init(&xmitq); @@ -1757,11 +1754,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - selector = msg_origport(buf_msg(skb)); - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); return 0; } @@ -1840,10 +1833,7 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) spin_unlock_bh(&sk->sk_lock.slock); } /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(net, skb, dnode, dport); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); sock_put(sk); continue; } -- cgit v1.2.3 From a80ae5306a7346d4e52f59462878beb8362f4bbd Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:22 +0200 Subject: tipc: improve destination linked list We often see a need for a linked list of destination identities, sometimes containing a port number, sometimes a node identity, and sometimes both. The currently defined struct u32_list is not generic enough to cover all cases, so we extend it to contain two u32 integers and rename it to struct tipc_dest_list. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 18 +++++------ net/tipc/name_table.c | 89 ++++++++++++++++++++++++++------------------------- net/tipc/name_table.h | 22 ++++++++----- net/tipc/socket.c | 12 +++---- 4 files changed, 74 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a140dd4a84af..329325bd553e 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -258,20 +258,20 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_nlist *dests, u16 *cong_link_cnt) { + struct tipc_dest *dst, *tmp; struct sk_buff_head _pkts; - struct u32_item *n, *tmp; - u32 dst, selector; + u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); skb_queue_head_init(&_pkts); - list_for_each_entry_safe(n, tmp, &dests->list, list) { - dst = n->value; - if (!tipc_msg_pskb_copy(dst, pkts, &_pkts)) + list_for_each_entry_safe(dst, tmp, &dests->list, list) { + dnode = dst->node; + if (!tipc_msg_pskb_copy(dnode, pkts, &_pkts)) return -ENOMEM; /* Any other return value than -ELINKCONG is ignored */ - if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG) + if (tipc_node_xmit(net, &_pkts, dnode, selector) == -ELINKCONG) (*cong_link_cnt)++; } return 0; @@ -554,7 +554,7 @@ void tipc_nlist_add(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = true; - else if (u32_push(&nl->list, node)) + else if (tipc_dest_push(&nl->list, node, 0)) nl->remote++; } @@ -562,13 +562,13 @@ void tipc_nlist_del(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = false; - else if (u32_del(&nl->list, node)) + else if (tipc_dest_del(&nl->list, node, 0)) nl->remote--; } void tipc_nlist_purge(struct tipc_nlist *nl) { - u32_list_purge(&nl->list); + tipc_dest_list_purge(&nl->list); nl->remote = 0; nl->local = 0; } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bd0aac87b41a..76bd2777baaf 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -634,7 +634,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, info = sseq->info; list_for_each_entry(publ, &info->node_list, node_list) { if (publ->scope <= limit) - u32_push(dports, publ->ref); + tipc_dest_push(dports, 0, publ->ref); } if (info->cluster_list_size != info->node_list_size) @@ -1057,78 +1057,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -bool u32_find(struct list_head *l, u32 value) +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return true; + list_for_each_entry(dst, l, list) { + if (dst->value != value) + continue; + return dst; } - return false; + return NULL; } -bool u32_push(struct list_head *l, u32 value) +bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return false; - } - item = kmalloc(sizeof(*item), GFP_ATOMIC); - if (unlikely(!item)) + if (tipc_dest_find(l, node, port)) return false; - item->value = value; - list_add(&item->list, l); + dst = kmalloc(sizeof(*dst), GFP_ATOMIC); + if (unlikely(!dst)) + return false; + dst->value = value; + list_add(&dst->list, l); return true; } -u32 u32_pop(struct list_head *l) +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { - struct u32_item *item; - u32 value = 0; + struct tipc_dest *dst; if (list_empty(l)) - return 0; - item = list_first_entry(l, typeof(*item), list); - value = item->value; - list_del(&item->list); - kfree(item); - return value; + return false; + dst = list_first_entry(l, typeof(*dst), list); + if (port) + *port = dst->port; + if (node) + *node = dst->node; + list_del(&dst->list); + kfree(dst); + return true; } -bool u32_del(struct list_head *l, u32 value) +bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { - struct u32_item *item, *tmp; + struct tipc_dest *dst; - list_for_each_entry_safe(item, tmp, l, list) { - if (item->value != value) - continue; - list_del(&item->list); - kfree(item); - return true; - } - return false; + dst = tipc_dest_find(l, node, port); + if (!dst) + return false; + list_del(&dst->list); + kfree(dst); + return true; } -void u32_list_purge(struct list_head *l) +void tipc_dest_list_purge(struct list_head *l) { - struct u32_item *item, *tmp; + struct tipc_dest *dst, *tmp; - list_for_each_entry_safe(item, tmp, l, list) { - list_del(&item->list); - kfree(item); + list_for_each_entry_safe(dst, tmp, l, list) { + list_del(&dst->list); + kfree(dst); } } -int u32_list_len(struct list_head *l) +int tipc_dest_list_len(struct list_head *l) { - struct u32_item *item; + struct tipc_dest *dst; int i = 0; - list_for_each_entry(item, l, list) { + list_for_each_entry(dst, l, list) { i++; } return i; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 6ebdeb1d84a5..d121175a92b5 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -120,16 +120,22 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); -struct u32_item { +struct tipc_dest { struct list_head list; - u32 value; + union { + struct { + u32 port; + u32 node; + }; + u64 value; + }; }; -bool u32_push(struct list_head *l, u32 value); -u32 u32_pop(struct list_head *l); -bool u32_find(struct list_head *l, u32 value); -bool u32_del(struct list_head *l, u32 value); -void u32_list_purge(struct list_head *l); -int u32_list_len(struct list_head *l); +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port); +bool tipc_dest_push(struct list_head *l, u32 node, u32 port); +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port); +bool tipc_dest_del(struct list_head *l, u32 node, u32 port); +void tipc_dest_list_purge(struct list_head *l); +int tipc_dest_list_len(struct list_head *l); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index c7c674934474..daf7c4df4531 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -565,7 +565,7 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - u32_list_purge(&tsk->cong_links); + tipc_dest_list_purge(&tsk->cong_links); tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -826,8 +826,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_nametbl_mc_translate(net, msg_nametype(msg), msg_namelower(msg), msg_nameupper(msg), scope, &dports); - portid = u32_pop(&dports); - for (; portid; portid = u32_pop(&dports)) { + while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -1000,7 +999,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } /* Block or return if destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(clinks, dnode, 0)); if (unlikely(rc)) return rc; @@ -1012,7 +1012,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { - u32_push(clinks, dnode); + tipc_dest_push(clinks, dnode, 0); tsk->cong_link_cnt++; rc = 0; } @@ -1549,7 +1549,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, tipc_sk_conn_proto_rcv(tsk, skb, xmitq); return; case SOCK_WAKEUP: - u32_del(&tsk->cong_links, msg_orignode(hdr)); + tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); tsk->cong_link_cnt--; sk->sk_write_space(sk); break; -- cgit v1.2.3 From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:23 +0200 Subject: tipc: introduce communication groups As a preparation for introducing flow control for multicast and datagram messaging we need a more strictly defined framework than we have now. A socket must be able keep track of exactly how many and which other sockets it is allowed to communicate with at any moment, and keep the necessary state for those. We therefore introduce a new concept we have named Communication Group. Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN. The call takes four parameters: 'type' serves as group identifier, 'instance' serves as an logical member identifier, and 'scope' indicates the visibility of the group (node/cluster/zone). Finally, 'flags' makes it possible to set certain properties for the member. For now, there is only one flag, indicating if the creator of the socket wants to receive a copy of broadcast or multicast messages it is sending via the socket, and if wants to be eligible as destination for its own anycasts. A group is closed, i.e., sockets which have not joined a group will not be able to send messages to or receive messages from members of the group, and vice versa. Any member of a group can send multicast ('group broadcast') messages to all group members, optionally including itself, using the primitive send(). The messages are received via the recvmsg() primitive. A socket can only be member of one group at a time. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/Makefile | 2 +- net/tipc/group.c | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/group.h | 64 ++++++++ net/tipc/link.c | 3 +- net/tipc/msg.h | 50 ++++++- net/tipc/name_table.c | 44 ++++-- net/tipc/name_table.h | 3 + net/tipc/node.h | 3 +- net/tipc/socket.c | 209 ++++++++++++++++++++++---- 9 files changed, 734 insertions(+), 48 deletions(-) create mode 100644 net/tipc/group.c create mode 100644 net/tipc/group.h (limited to 'net') diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 31b9f9c52974..a3af73ec0b78 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -8,7 +8,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o + server.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/group.c b/net/tipc/group.c new file mode 100644 index 000000000000..3f0e1ce1e3b9 --- /dev/null +++ b/net/tipc/group.c @@ -0,0 +1,404 @@ +/* + * net/tipc/group.c: TIPC group messaging code + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "group.h" +#include "bcast.h" +#include "server.h" +#include "msg.h" +#include "socket.h" +#include "node.h" +#include "name_table.h" +#include "subscr.h" + +#define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) +#define ADV_IDLE ADV_UNIT + +enum mbr_state { + MBR_QUARANTINED, + MBR_DISCOVERED, + MBR_JOINING, + MBR_PUBLISHED, + MBR_JOINED, + MBR_LEAVING +}; + +struct tipc_member { + struct rb_node tree_node; + struct list_head list; + u32 node; + u32 port; + enum mbr_state state; + u16 bc_rcv_nxt; +}; + +struct tipc_group { + struct rb_root members; + struct tipc_nlist dests; + struct net *net; + int subid; + u32 type; + u32 instance; + u32 domain; + u32 scope; + u32 portid; + u16 member_cnt; + u16 bc_snd_nxt; + bool loopback; +}; + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq); + +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) +{ + return grp->bc_snd_nxt; +} + +static bool tipc_group_is_receiver(struct tipc_member *m) +{ + return m && m->state >= MBR_JOINED; +} + +int tipc_group_size(struct tipc_group *grp) +{ + return grp->member_cnt; +} + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq) +{ + struct tipc_group *grp; + u32 type = mreq->type; + + grp = kzalloc(sizeof(*grp), GFP_ATOMIC); + if (!grp) + return NULL; + tipc_nlist_init(&grp->dests, tipc_own_addr(net)); + grp->members = RB_ROOT; + grp->net = net; + grp->portid = portid; + grp->domain = addr_domain(net, mreq->scope); + grp->type = type; + grp->instance = mreq->instance; + grp->scope = mreq->scope; + grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + return grp; + kfree(grp); + return NULL; +} + +void tipc_group_delete(struct net *net, struct tipc_group *grp) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + __skb_queue_head_init(&xmitq); + + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); + list_del(&m->list); + kfree(m); + } + tipc_node_distr_xmit(net, &xmitq); + tipc_nlist_purge(&grp->dests); + tipc_topsrv_kern_unsubscr(net, grp->subid); + kfree(grp); +} + +struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) +{ + struct rb_node *n = grp->members.rb_node; + u64 nkey, key = (u64)node << 32 | port; + struct tipc_member *m; + + while (n) { + m = container_of(n, struct tipc_member, tree_node); + nkey = (u64)m->node << 32 | m->port; + if (key < nkey) + n = n->rb_left; + else if (key > nkey) + n = n->rb_right; + else + return m; + } + return NULL; +} + +static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, + u32 node) +{ + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (m->node == node) + return m; + } + return NULL; +} + +static void tipc_group_add_to_tree(struct tipc_group *grp, + struct tipc_member *m) +{ + u64 nkey, key = (u64)m->node << 32 | m->port; + struct rb_node **n, *parent = NULL; + struct tipc_member *tmp; + + n = &grp->members.rb_node; + while (*n) { + tmp = container_of(*n, struct tipc_member, tree_node); + parent = *n; + tmp = container_of(parent, struct tipc_member, tree_node); + nkey = (u64)tmp->node << 32 | tmp->port; + if (key < nkey) + n = &(*n)->rb_left; + else if (key > nkey) + n = &(*n)->rb_right; + else + return; + } + rb_link_node(&m->tree_node, parent, n); + rb_insert_color(&m->tree_node, &grp->members); +} + +static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, + u32 node, u32 port, + int state) +{ + struct tipc_member *m; + + m = kzalloc(sizeof(*m), GFP_ATOMIC); + if (!m) + return NULL; + INIT_LIST_HEAD(&m->list); + m->node = node; + m->port = port; + grp->member_cnt++; + tipc_group_add_to_tree(grp, m); + tipc_nlist_add(&grp->dests, m->node); + m->state = state; + return m; +} + +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +{ + tipc_group_create_member(grp, node, port, MBR_DISCOVERED); +} + +static void tipc_group_delete_member(struct tipc_group *grp, + struct tipc_member *m) +{ + rb_erase(&m->tree_node, &grp->members); + grp->member_cnt--; + list_del_init(&m->list); + + /* If last member on a node, remove node from dest list */ + if (!tipc_group_find_node(grp, m->node)) + tipc_nlist_del(&grp->dests, m->node); + + kfree(m); +} + +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) +{ + return &grp->dests; +} + +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope) +{ + seq->type = grp->type; + seq->lower = grp->instance; + seq->upper = grp->instance; + *scope = grp->scope; +} + +void tipc_group_update_bc_members(struct tipc_group *grp) +{ + grp->bc_snd_nxt++; +} + +/* tipc_group_filter_msg() - determine if we should accept arriving message + */ +void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_member *m; + struct tipc_msg *hdr; + u32 node, port; + int mtyp; + + if (!skb) + return; + + hdr = buf_msg(skb); + mtyp = msg_type(hdr); + node = msg_orignode(hdr); + port = msg_origport(hdr); + + if (!msg_in_group(hdr)) + goto drop; + + m = tipc_group_find_member(grp, node, port); + if (!tipc_group_is_receiver(m)) + goto drop; + + __skb_queue_tail(inputq, skb); + + m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; + return; +drop: + kfree_skb(skb); +} + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr; + struct sk_buff *skb; + + skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, + m->node, tipc_own_addr(grp->net), + m->port, grp->portid, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + if (mtyp == GRP_JOIN_MSG) + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + __skb_queue_tail(xmitq, skb); +} + +void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *xmitq) +{ + u32 node = msg_orignode(hdr); + u32 port = msg_origport(hdr); + struct tipc_member *m; + + if (!grp) + return; + + m = tipc_group_find_member(grp, node, port); + + switch (msg_type(hdr)) { + case GRP_JOIN_MSG: + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_QUARANTINED); + if (!m) + return; + m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); + + /* Wait until PUBLISH event is received */ + if (m->state == MBR_DISCOVERED) + m->state = MBR_JOINING; + else if (m->state == MBR_PUBLISHED) + m->state = MBR_JOINED; + return; + case GRP_LEAVE_MSG: + if (!m) + return; + + /* Wait until WITHDRAW event is received */ + if (m->state != MBR_LEAVING) { + m->state = MBR_LEAVING; + return; + } + /* Otherwise deliver already received WITHDRAW event */ + tipc_group_delete_member(grp, m); + return; + default: + pr_warn("Received unknown GROUP_PROTO message\n"); + } +} + +/* tipc_group_member_evt() - receive and handle a member up/down event + */ +void tipc_group_member_evt(struct tipc_group *grp, + struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_event *evt = (void *)msg_data(hdr); + u32 node = evt->port.node; + u32 port = evt->port.ref; + struct tipc_member *m; + struct net *net; + u32 self; + + if (!grp) + goto drop; + + net = grp->net; + self = tipc_own_addr(net); + if (!grp->loopback && node == self && port == grp->portid) + goto drop; + + m = tipc_group_find_member(grp, node, port); + + if (evt->event == TIPC_PUBLISHED) { + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_DISCOVERED); + if (!m) + goto drop; + + /* Wait if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) + m->state = MBR_PUBLISHED; + else + m->state = MBR_JOINED; + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + } else if (evt->event == TIPC_WITHDRAWN) { + if (!m) + goto drop; + + /* Keep back event if more messages might be expected */ + if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) + m->state = MBR_LEAVING; + else + tipc_group_delete_member(grp, m); + } +drop: + kfree_skb(skb); +} diff --git a/net/tipc/group.h b/net/tipc/group.h new file mode 100644 index 000000000000..9bdf4479fc03 --- /dev/null +++ b/net/tipc/group.h @@ -0,0 +1,64 @@ +/* + * net/tipc/group.h: Include file for TIPC group unicast/multicast functions + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_GROUP_H +#define _TIPC_GROUP_H + +#include "core.h" + +struct tipc_group; +struct tipc_member; +struct tipc_msg; + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq); +void tipc_group_delete(struct net *net, struct tipc_group *grp); +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope); +void tipc_group_filter_msg(struct tipc_group *grp, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_member_evt(struct tipc_group *grp, + struct sk_buff *skb, + struct sk_buff_head *xmitq); +void tipc_group_proto_rcv(struct tipc_group *grp, + struct tipc_msg *hdr, + struct sk_buff_head *xmitq); +void tipc_group_update_bc_members(struct tipc_group *grp); +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); +int tipc_group_size(struct tipc_group *grp); +#endif diff --git a/net/tipc/link.c b/net/tipc/link.c index ac0144f532aa..bd25bff63925 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1046,11 +1046,12 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: - if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) { + if (unlikely(msg_mcast(hdr))) { skb_queue_tail(l->bc_rcvlink->inputq, skb); return true; } case CONN_MANAGER: + case GROUP_PROTOCOL: skb_queue_tail(inputq, skb); return true; case NAME_DISTRIBUTOR: diff --git a/net/tipc/msg.h b/net/tipc/msg.h index be3e38aa9dd2..dad400935405 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014-2015 Ericsson AB + * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * @@ -61,10 +61,11 @@ struct plist; /* * Payload message types */ -#define TIPC_CONN_MSG 0 -#define TIPC_MCAST_MSG 1 -#define TIPC_NAMED_MSG 2 -#define TIPC_DIRECT_MSG 3 +#define TIPC_CONN_MSG 0 +#define TIPC_MCAST_MSG 1 +#define TIPC_NAMED_MSG 2 +#define TIPC_DIRECT_MSG 3 +#define TIPC_GRP_BCAST_MSG 4 /* * Internal message users @@ -73,6 +74,7 @@ struct plist; #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 +#define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 @@ -87,6 +89,7 @@ struct plist; #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ +#define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ @@ -252,6 +255,11 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 29, 0x7, n); } +static inline int msg_in_group(struct tipc_msg *m) +{ + return (msg_type(m) == TIPC_GRP_BCAST_MSG); +} + static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; @@ -259,7 +267,9 @@ static inline u32 msg_named(struct tipc_msg *m) static inline u32 msg_mcast(struct tipc_msg *m) { - return msg_type(m) == TIPC_MCAST_MSG; + int mtyp = msg_type(m); + + return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) @@ -514,6 +524,12 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 +/* + * Group protocol message types + */ +#define GRP_JOIN_MSG 0 +#define GRP_LEAVE_MSG 1 + /* * Word 1 */ @@ -795,6 +811,28 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +/* Word 10 + */ +static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 10, 16, 0xffff); +} + +static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 10, 16, 0xffff, n); +} + static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 76bd2777baaf..114d72bab827 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -43,6 +43,7 @@ #include "bcast.h" #include "addr.h" #include "node.h" +#include "group.h" #include #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -596,18 +597,6 @@ not_found: return ref; } -/** - * tipc_nametbl_mc_translate - find multicast destinations - * - * Creates list of all local ports that overlap the given multicast address; - * also determines if any off-node ports overlap. - * - * Note: Publications with a scope narrower than 'limit' are ignored. - * (i.e. local node-scope publications mustn't receive messages arriving - * from another node, even if the multcast link brought it here) - * - * Returns non-zero if any off-node ports overlap - */ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports) { @@ -679,6 +668,37 @@ exit: rcu_read_unlock(); } +/* tipc_nametbl_build_group - build list of communication group members + */ +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain) +{ + struct sub_seq *sseq, *stop; + struct name_info *info; + struct publication *p; + struct name_seq *seq; + + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + sseq = seq->sseqs; + stop = seq->sseqs + seq->first_free; + for (; sseq != stop; sseq++) { + info = sseq->info; + list_for_each_entry(p, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, p->node)) + continue; + tipc_group_add_member(grp, p->node, p->ref); + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); +} + /* * tipc_nametbl_publish - add name publication to network name tables */ diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index d121175a92b5..97646b17a4a2 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -40,6 +40,7 @@ struct tipc_subscription; struct tipc_plist; struct tipc_nlist; +struct tipc_group; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -101,6 +102,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports); +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, u32 domain, struct tipc_nlist *nodes); diff --git a/net/tipc/node.h b/net/tipc/node.h index df2f2197c4ad..acd58d23a70e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -48,7 +48,8 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), - TIPC_BCAST_RCAST = (1 << 4) + TIPC_BCAST_RCAST = (1 << 4), + TIPC_MCAST_GROUPS = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ diff --git a/net/tipc/socket.c b/net/tipc/socket.c index daf7c4df4531..64bbf9d03629 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2016, Ericsson AB + * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -45,6 +45,7 @@ #include "socket.h" #include "bcast.h" #include "netlink.h" +#include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -78,7 +79,7 @@ enum { * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links - * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node @@ -109,6 +110,7 @@ struct tipc_sock { struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; + struct tipc_group *group; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -123,6 +125,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); +static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); @@ -559,6 +562,7 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; + tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); @@ -601,7 +605,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } - + if (tsk->group) { + res = -EACCES; + goto exit; + } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; @@ -698,6 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; u32 mask = 0; sock_poll_wait(file, sk_sleep(sk), wait); @@ -718,8 +726,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); break; case TIPC_OPEN: - if (!tsk->cong_link_cnt) - mask |= POLLOUT; + if (!grp || tipc_group_size(grp)) + if (!tsk->cong_link_cnt) + mask |= POLLOUT; if (tipc_sk_type_connectionless(sk) && (!skb_queue_empty(&sk->sk_receive_queue))) mask |= (POLLIN | POLLRDNORM); @@ -757,6 +766,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_nlist dsts; int rc; + if (tsk->group) + return -EACCES; + /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) @@ -793,6 +805,64 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, return rc ? rc : dlen; } +/** + * tipc_send_group_bcast - send message to all members in communication group + * @sk: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_mc_method *method = &tsk->mc_method; + struct tipc_msg *hdr = &tsk->phdr; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; + int rc = -EHOSTUNREACH; + + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + + /* Block or return if any destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nameinst(hdr, 0); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_mcast_xmit(net, &pkts, method, dsts, + &tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Update broadcast sequence number */ + tipc_group_update_bc_members(tsk->group); + + return dlen; +} + /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup @@ -803,13 +873,15 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - struct tipc_msg *msg; - struct list_head dports; - u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; - struct sk_buff_head tmpq; - uint hsz; + u32 self = tipc_own_addr(net); struct sk_buff *skb, *_skb; + u32 lower = 0, upper = ~0; + struct sk_buff_head tmpq; + u32 portid, oport, onode; + struct list_head dports; + struct tipc_msg *msg; + int hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -818,14 +890,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); hsz = skb_headroom(skb) + msg_hdr_sz(msg); - - if (in_own_node(net, msg_orignode(msg))) + oport = msg_origport(msg); + onode = msg_orignode(msg); + if (onode == self) scope = TIPC_NODE_SCOPE; /* Create destination port list and message clones: */ - tipc_nametbl_mc_translate(net, - msg_nametype(msg), msg_namelower(msg), - msg_nameupper(msg), scope, &dports); + if (!msg_in_group(msg)) { + lower = msg_namelower(msg); + upper = msg_nameupper(msg); + } + tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, + scope, &dports); while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { @@ -895,10 +971,6 @@ exit: kfree_skb(skb); } -static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt) -{ -} - /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -934,6 +1006,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; @@ -944,6 +1017,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (unlikely(grp)) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (unlikely(!dest)) { dest = &tsk->peer; if (!syn || dest->family != AF_TIPC) @@ -1543,6 +1619,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, struct sk_buff *skb = __skb_dequeue(inputq); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + struct tipc_group *grp = tsk->group; switch (msg_user(hdr)) { case CONN_MANAGER: @@ -1553,8 +1630,12 @@ static void tipc_sk_proto_rcv(struct sock *sk, tsk->cong_link_cnt--; sk->sk_write_space(sk); break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, hdr, xmitq); + break; case TOP_SRV: - tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + tipc_group_member_evt(tsk->group, skb, xmitq); + skb = NULL; break; default: break; @@ -1699,6 +1780,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, { bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); struct net *net = sock_net(sk); struct sk_buff_head inputq; @@ -1710,15 +1792,19 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) + else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG)) return kfree_skb(skb); + if (unlikely(grp)) + tipc_group_filter_msg(grp, &inputq, xmitq); + /* Validate and add to receive buffer if there is space */ while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); limit = rcvbuf_limit(sk, skb); if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || - (!sk_conn && msg_connected(hdr))) + (!sk_conn && msg_connected(hdr)) || + (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) err = TIPC_ERR_OVERLOAD; @@ -1837,7 +1923,6 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) sock_put(sk); continue; } - /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) @@ -1905,6 +1990,11 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, lock_sock(sk); + if (tsk->group) { + res = -EINVAL; + goto exit; + } + if (dst->family == AF_UNSPEC) { memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); if (!tipc_sk_type_connectionless(sk)) @@ -2341,6 +2431,52 @@ void tipc_sk_rht_destroy(struct net *net) rhashtable_destroy(&tn->sk_rht); } +static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) +{ + struct net *net = sock_net(&tsk->sk); + u32 domain = addr_domain(net, mreq->scope); + struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq seq; + int rc; + + if (mreq->type < TIPC_RESERVED_TYPES) + return -EACCES; + if (grp) + return -EACCES; + grp = tipc_group_create(net, tsk->portid, mreq); + if (!grp) + return -ENOMEM; + tsk->group = grp; + msg_set_lookup_scope(hdr, mreq->scope); + msg_set_nametype(hdr, mreq->type); + msg_set_dest_droppable(hdr, true); + seq.type = mreq->type; + seq.lower = mreq->instance; + seq.upper = seq.lower; + tipc_nametbl_build_group(net, grp, mreq->type, domain); + rc = tipc_sk_publish(tsk, mreq->scope, &seq); + if (rc) + tipc_group_delete(net, grp); + return rc; +} + +static int tipc_sk_leave(struct tipc_sock *tsk) +{ + struct net *net = sock_net(&tsk->sk); + struct tipc_group *grp = tsk->group; + struct tipc_name_seq seq; + int scope; + + if (!grp) + return -EINVAL; + tipc_group_self(grp, &seq, &scope); + tipc_group_delete(net, grp); + tsk->group = NULL; + tipc_sk_withdraw(tsk, scope, &seq); + return 0; +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -2359,6 +2495,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group_req mreq; u32 value = 0; int res = 0; @@ -2374,9 +2511,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + if (get_user(value, (u32 __user *)ov)) + return -EFAULT; + break; + case TIPC_GROUP_JOIN: + if (ol < sizeof(mreq)) + return -EINVAL; + if (copy_from_user(&mreq, ov, sizeof(mreq))) + return -EFAULT; break; default: if (ov || ol) @@ -2409,6 +2551,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; + case TIPC_GROUP_JOIN: + res = tipc_sk_join(tsk, &mreq); + break; + case TIPC_GROUP_LEAVE: + res = tipc_sk_leave(tsk); + break; default: res = -EINVAL; } @@ -2436,7 +2584,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - int len; + struct tipc_name_seq seq; + int len, scope; u32 value; int res; @@ -2470,6 +2619,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_GROUP_JOIN: + seq.type = 0; + if (tsk->group) + tipc_group_self(tsk->group, &seq, &scope); + value = seq.type; + break; default: res = -EINVAL; } -- cgit v1.2.3 From 31c82a2d9d51fccbb85cbd2be983eb115225301c Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:24 +0200 Subject: tipc: add second source address to recvmsg()/recvfrom() With group communication, it becomes important for a message receiver to identify not only from which socket (identfied by a node:port tuple) the message was sent, but also the logical identity (type:instance) of the sending member. We fix this by adding a second instance of struct sockaddr_tipc to the source address area when a message is read. The extra address struct is filled in with data found in the received message header (type,) and in the local member representation struct (instance.) Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 3 +++ net/tipc/msg.h | 1 + net/tipc/socket.c | 53 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 39 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 3f0e1ce1e3b9..beb214a3420c 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -61,6 +61,7 @@ struct tipc_member { struct list_head list; u32 node; u32 port; + u32 instance; enum mbr_state state; u16 bc_rcv_nxt; }; @@ -282,6 +283,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!tipc_group_is_receiver(m)) goto drop; + TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; @@ -388,6 +390,7 @@ void tipc_group_member_evt(struct tipc_group *grp, m->state = MBR_PUBLISHED; else m->state = MBR_JOINED; + m->instance = evt->found_lower; tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); } else if (evt->event == TIPC_WITHDRAWN) { if (!m) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index dad400935405..e438716d2372 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -100,6 +100,7 @@ struct plist; struct tipc_skb_cb { u32 bytes_read; + u32 orig_member; struct sk_buff *tail; bool validated; u16 chain_imp; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 64bbf9d03629..25ecf1201527 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -62,6 +62,11 @@ enum { TIPC_CONNECTING = TCP_SYN_SENT, }; +struct sockaddr_pair { + struct sockaddr_tipc sock; + struct sockaddr_tipc member; +}; + /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API @@ -1222,26 +1227,38 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, } /** - * set_orig_addr - capture sender's address for received message + * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @msg: received message header + * @hdr: received message header * * Note: Address is not captured if not requested by receiver. */ -static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) -{ - DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name); - - if (addr) { - addr->family = AF_TIPC; - addr->addrtype = TIPC_ADDR_ID; - memset(&addr->addr, 0, sizeof(addr->addr)); - addr->addr.id.ref = msg_origport(msg); - addr->addr.id.node = msg_orignode(msg); - addr->addr.name.domain = 0; /* could leave uninitialized */ - addr->scope = 0; /* could leave uninitialized */ - m->msg_namelen = sizeof(struct sockaddr_tipc); - } +static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) +{ + DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name); + struct tipc_msg *hdr = buf_msg(skb); + + if (!srcaddr) + return; + + srcaddr->sock.family = AF_TIPC; + srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addr.id.ref = msg_origport(hdr); + srcaddr->sock.addr.id.node = msg_orignode(hdr); + srcaddr->sock.addr.name.domain = 0; + srcaddr->sock.scope = 0; + m->msg_namelen = sizeof(struct sockaddr_tipc); + + if (!msg_in_group(hdr)) + return; + + /* Group message users may also want to know sending member's id */ + srcaddr->member.family = AF_TIPC; + srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addr.name.name.type = msg_nametype(hdr); + srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; + srcaddr->member.addr.name.domain = 0; + m->msg_namelen = sizeof(*srcaddr); } /** @@ -1432,7 +1449,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } while (1); /* Collect msg meta data, including error code and rejected data */ - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (unlikely(rc)) goto exit; @@ -1526,7 +1543,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (rc) break; -- cgit v1.2.3 From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:25 +0200 Subject: tipc: receive group membership events via member socket Like with any other service, group members' availability can be subscribed for by connecting to be topology server. However, because the events arrive via a different socket than the member socket, there is a real risk that membership events my arrive out of synch with the actual JOIN/LEAVE action. I.e., it is possible to receive the first messages from a new member before the corresponding JOIN event arrives, just as it is possible to receive the last messages from a leaving member after the LEAVE event has already been received. Since each member socket is internally also subscribing for membership events, we now fix this problem by passing those events on to the user via the member socket. We leverage the already present member synch- ronization protocol to guarantee correct message/event order. An event is delivered to the user as an empty message where the two source addresses identify the new/lost member. Furthermore, we set the MSG_OOB bit in the message flags to mark it as an event. If the event is an indication about a member loss we also set the MSG_EOR bit, so it can be distinguished from a member addition event. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 60 +++++++++++++++++++++++++++++++++++++++++++------------ net/tipc/group.h | 2 ++ net/tipc/msg.h | 22 ++++++++++++++++++-- net/tipc/socket.c | 49 ++++++++++++++++++++++++++++----------------- 4 files changed, 100 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index beb214a3420c..1bfa9348b26d 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -59,6 +59,7 @@ enum mbr_state { struct tipc_member { struct rb_node tree_node; struct list_head list; + struct sk_buff *event_msg; u32 node; u32 port; u32 instance; @@ -79,6 +80,7 @@ struct tipc_group { u16 member_cnt; u16 bc_snd_nxt; bool loopback; + bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, @@ -117,6 +119,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) return grp; kfree(grp); @@ -279,6 +282,13 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!msg_in_group(hdr)) goto drop; + if (mtyp == TIPC_GRP_MEMBER_EVT) { + if (!grp->events) + goto drop; + __skb_queue_tail(inputq, skb); + return; + } + m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_receiver(m)) goto drop; @@ -311,6 +321,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); @@ -332,10 +343,12 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); /* Wait until PUBLISH event is received */ - if (m->state == MBR_DISCOVERED) + if (m->state == MBR_DISCOVERED) { m->state = MBR_JOINING; - else if (m->state == MBR_PUBLISHED) + } else if (m->state == MBR_PUBLISHED) { m->state = MBR_JOINED; + __skb_queue_tail(inputq, m->event_msg); + } return; case GRP_LEAVE_MSG: if (!m) @@ -347,6 +360,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, return; } /* Otherwise deliver already received WITHDRAW event */ + __skb_queue_tail(inputq, m->event_msg); tipc_group_delete_member(grp, m); return; default: @@ -354,16 +368,17 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, } } -/* tipc_group_member_evt() - receive and handle a member up/down event - */ void tipc_group_member_evt(struct tipc_group *grp, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_event *evt = (void *)msg_data(hdr); + u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; + int event = evt->event; struct tipc_member *m; struct net *net; u32 self; @@ -376,32 +391,51 @@ void tipc_group_member_evt(struct tipc_group *grp, if (!grp->loopback && node == self && port == grp->portid) goto drop; + /* Convert message before delivery to user */ + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); + msg_set_origport(hdr, port); + msg_set_orignode(hdr, node); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + m = tipc_group_find_member(grp, node, port); - if (evt->event == TIPC_PUBLISHED) { + if (event == TIPC_PUBLISHED) { if (!m) m = tipc_group_create_member(grp, node, port, MBR_DISCOVERED); if (!m) goto drop; - /* Wait if JOIN message not yet received */ - if (m->state == MBR_DISCOVERED) + /* Hold back event if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) { + m->event_msg = skb; m->state = MBR_PUBLISHED; - else + } else { + __skb_queue_tail(inputq, skb); m->state = MBR_JOINED; - m->instance = evt->found_lower; + } + m->instance = instance; + TIPC_SKB_CB(skb)->orig_member = m->instance; tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); - } else if (evt->event == TIPC_WITHDRAWN) { + } else if (event == TIPC_WITHDRAWN) { if (!m) goto drop; - /* Keep back event if more messages might be expected */ - if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) + TIPC_SKB_CB(skb)->orig_member = m->instance; + + /* Hold back event if more messages might be expected */ + if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) { + m->event_msg = skb; m->state = MBR_LEAVING; - else + } else { + __skb_queue_tail(inputq, skb); tipc_group_delete_member(grp, m); + } } + return; drop: kfree_skb(skb); } diff --git a/net/tipc/group.h b/net/tipc/group.h index 9bdf4479fc03..5d3f10d28967 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -54,9 +54,11 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *xmitq); void tipc_group_member_evt(struct tipc_group *grp, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_update_bc_members(struct tipc_group *grp); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index e438716d2372..1b527b154e46 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -65,7 +65,8 @@ struct plist; #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 -#define TIPC_GRP_BCAST_MSG 4 +#define TIPC_GRP_MEMBER_EVT 4 +#define TIPC_GRP_BCAST_MSG 5 /* * Internal message users @@ -258,7 +259,14 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) static inline int msg_in_group(struct tipc_msg *m) { - return (msg_type(m) == TIPC_GRP_BCAST_MSG); + int mtyp = msg_type(m); + + return (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MEMBER_EVT); +} + +static inline bool msg_is_grp_evt(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) @@ -824,6 +832,16 @@ static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) /* Word 10 */ +static inline u16 msg_grp_evt(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x3); +} + +static inline void msg_set_grp_evt(struct tipc_msg *m, int n) +{ + msg_set_bits(m, 10, 0, 0x3, n); +} + static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 25ecf1201527..0a2eac309177 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -709,41 +709,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; - u32 mask = 0; + u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; + revents |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + revents |= POLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) - mask |= POLLOUT; + revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= (POLLIN | POLLRDNORM); + if (skb) + revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: if (!grp || tipc_group_size(grp)) if (!tsk->cong_link_cnt) - mask |= POLLOUT; - if (tipc_sk_type_connectionless(sk) && - (!skb_queue_empty(&sk->sk_receive_queue))) - mask |= (POLLIN | POLLRDNORM); + revents |= POLLOUT; + if (!tipc_sk_type_connectionless(sk)) + break; + if (!skb) + break; + revents |= POLLIN | POLLRDNORM; break; case TIPC_DISCONNECTING: - mask = (POLLIN | POLLRDNORM | POLLHUP); + revents = POLLIN | POLLRDNORM | POLLHUP; break; } - - return mask; + return revents; } /** @@ -1415,11 +1417,12 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *skb; - struct tipc_msg *hdr; bool connected = !tipc_sk_type_connectionless(sk); + struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_msg *hdr; + struct sk_buff *skb; + bool grp_evt; long timeout; /* Catch invalid receive requests */ @@ -1443,6 +1446,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); + grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); @@ -1469,11 +1473,20 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; + /* Mark message as group event if applicable */ + if (unlikely(grp_evt)) { + if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) + m->msg_flags |= MSG_EOR; + m->msg_flags |= MSG_OOB; + copy = 0; + } + /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; tsk_advance_rx_queue(sk); + if (likely(!connected)) goto exit; @@ -1648,10 +1661,10 @@ static void tipc_sk_proto_rcv(struct sock *sk, sk->sk_write_space(sk); break; case GROUP_PROTOCOL: - tipc_group_proto_rcv(grp, hdr, xmitq); + tipc_group_proto_rcv(grp, hdr, inputq, xmitq); break; case TOP_SRV: - tipc_group_member_evt(tsk->group, skb, xmitq); + tipc_group_member_evt(tsk->group, skb, inputq, xmitq); skb = NULL; break; default: -- cgit v1.2.3 From b7d42635517fde2b095deddd0fba37be2302a285 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:26 +0200 Subject: tipc: introduce flow control for group broadcast messages We introduce an end-to-end flow control mechanism for group broadcast messages. This ensures that no messages are ever lost because of destination receive buffer overflow, with minimal impact on performance. For now, the algorithm is based on the assumption that there is only one active transmitter at any moment in time. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- net/tipc/group.h | 11 ++-- net/tipc/msg.h | 5 +- net/tipc/socket.c | 48 +++++++++++++----- 4 files changed, 190 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 1bfa9348b26d..b8ed70abba01 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -46,6 +46,7 @@ #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT +#define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_QUARANTINED, @@ -59,16 +60,22 @@ enum mbr_state { struct tipc_member { struct rb_node tree_node; struct list_head list; + struct list_head congested; struct sk_buff *event_msg; + struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; + u16 advertised; + u16 window; u16 bc_rcv_nxt; + bool usr_pending; }; struct tipc_group { struct rb_root members; + struct list_head congested; struct tipc_nlist dests; struct net *net; int subid; @@ -86,11 +93,24 @@ struct tipc_group { static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); +static int tipc_group_rcvbuf_limit(struct tipc_group *grp) +{ + int mcnt = grp->member_cnt + 1; + + /* Scale to bytes, considering worst-case truesize/msgsize ratio */ + return mcnt * ADV_ACTIVE * FLOWCTL_BLK_SZ * 4; +} + u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } +static bool tipc_group_is_enabled(struct tipc_member *m) +{ + return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING; +} + static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state >= MBR_JOINED; @@ -111,6 +131,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); + INIT_LIST_HEAD(&grp->congested); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; @@ -213,6 +234,8 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, if (!m) return NULL; INIT_LIST_HEAD(&m->list); + INIT_LIST_HEAD(&m->congested); + m->group = grp; m->node = node; m->port = port; grp->member_cnt++; @@ -233,6 +256,7 @@ static void tipc_group_delete_member(struct tipc_group *grp, rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; list_del_init(&m->list); + list_del_init(&m->congested); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) @@ -255,11 +279,59 @@ void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, *scope = grp->scope; } -void tipc_group_update_bc_members(struct tipc_group *grp) +void tipc_group_update_member(struct tipc_member *m, int len) +{ + struct tipc_group *grp = m->group; + struct tipc_member *_m, *tmp; + + if (!tipc_group_is_enabled(m)) + return; + + m->window -= len; + + if (m->window >= ADV_IDLE) + return; + + if (!list_empty(&m->congested)) + return; + + /* Sort member into congested members' list */ + list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { + if (m->window > _m->window) + continue; + list_add_tail(&m->congested, &_m->congested); + return; + } + list_add_tail(&m->congested, &grp->congested); +} + +void tipc_group_update_bc_members(struct tipc_group *grp, int len) { + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (tipc_group_is_enabled(m)) + tipc_group_update_member(m, len); + } grp->bc_snd_nxt++; } +bool tipc_group_bc_cong(struct tipc_group *grp, int len) +{ + struct tipc_member *m; + + if (list_empty(&grp->congested)) + return false; + + m = list_first_entry(&grp->congested, struct tipc_member, congested); + if (m->window >= len) + return false; + + return true; +} + /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, @@ -302,11 +374,36 @@ drop: kfree_skb(skb); } +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq) +{ + struct tipc_member *m; + + m = tipc_group_find_member(grp, node, port); + if (!m) + return; + + m->advertised -= blks; + + switch (m->state) { + case MBR_JOINED: + if (m->advertised <= (ADV_ACTIVE - ADV_UNIT)) + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + break; + case MBR_DISCOVERED: + case MBR_JOINING: + case MBR_LEAVING: + default: + break; + } +} + static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; + int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), @@ -314,14 +411,24 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, if (!skb) return; + if (m->state == MBR_JOINED) + adv = ADV_ACTIVE - m->advertised; + hdr = buf_msg(skb); - if (mtyp == GRP_JOIN_MSG) + + if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } else if (mtyp == GRP_ADV_MSG) { + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } __skb_queue_tail(xmitq, skb); } -void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, - struct sk_buff_head *inputq, +void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, + struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); @@ -341,14 +448,22 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, if (!m) return; m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); + m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received */ if (m->state == MBR_DISCOVERED) { m->state = MBR_JOINING; } else if (m->state == MBR_PUBLISHED) { m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); __skb_queue_tail(inputq, m->event_msg); } + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); return; case GRP_LEAVE_MSG: if (!m) @@ -361,14 +476,28 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, } /* Otherwise deliver already received WITHDRAW event */ __skb_queue_tail(inputq, m->event_msg); + *usr_wakeup = m->usr_pending; tipc_group_delete_member(grp, m); + list_del_init(&m->congested); + return; + case GRP_ADV_MSG: + if (!m) + return; + m->window += msg_adv_win(hdr); + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + list_del_init(&m->congested); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } +/* tipc_group_member_evt() - receive and handle a member up/down event + */ void tipc_group_member_evt(struct tipc_group *grp, + bool *usr_wakeup, + int *sk_rcvbuf, struct sk_buff *skb, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) @@ -416,16 +545,25 @@ void tipc_group_member_evt(struct tipc_group *grp, } else { __skb_queue_tail(inputq, skb); m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; } m->instance = instance; TIPC_SKB_CB(skb)->orig_member = m->instance; tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); } else if (event == TIPC_WITHDRAWN) { if (!m) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + /* Hold back event if more messages might be expected */ if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) { m->event_msg = skb; @@ -434,7 +572,9 @@ void tipc_group_member_evt(struct tipc_group *grp, __skb_queue_tail(inputq, skb); tipc_group_delete_member(grp, m); } + list_del_init(&m->congested); } + *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); return; drop: kfree_skb(skb); diff --git a/net/tipc/group.h b/net/tipc/group.h index 5d3f10d28967..0e2740e1da90 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -52,15 +52,18 @@ void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); -void tipc_group_member_evt(struct tipc_group *grp, - struct sk_buff *skb, +void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup, + int *sk_rcvbuf, struct sk_buff *skb, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); -void tipc_group_proto_rcv(struct tipc_group *grp, +void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); -void tipc_group_update_bc_members(struct tipc_group *grp); +void tipc_group_update_bc_members(struct tipc_group *grp, int len); +bool tipc_group_bc_cong(struct tipc_group *grp, int len); +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 1b527b154e46..237d007499f9 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -538,6 +538,7 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) */ #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 +#define GRP_ADV_MSG 2 /* * Word 1 @@ -790,12 +791,12 @@ static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_adv_win(struct tipc_msg *m) +static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } -static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 0a2eac309177..50145c95ac96 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -201,6 +201,11 @@ static bool tsk_conn_cong(struct tipc_sock *tsk) return tsk->snt_unacked > tsk->snd_win; } +static u16 tsk_blocks(int len) +{ + return ((len / FLOWCTL_BLK_SZ) + 1); +} + /* tsk_blocks(): translate a buffer size in bytes to number of * advertisable blocks, taking into account the ratio truesize(len)/len * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ @@ -831,6 +836,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct tipc_group *grp = tsk->group; struct tipc_nlist *dsts = tipc_group_dests(grp); struct tipc_mc_method *method = &tsk->mc_method; + int blks = tsk_blocks(MCAST_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; int mtu = tipc_bcast_get_mtu(net); struct sk_buff_head pkts; @@ -839,14 +845,15 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, if (!dsts->local && !dsts->remote) return -EHOSTUNREACH; - /* Block or return if any destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + /* Block or return if any destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && + !tipc_group_bc_cong(grp, blks)); if (unlikely(rc)) return rc; /* Complete message header */ msg_set_type(hdr, TIPC_GRP_BCAST_MSG); - msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); msg_set_nameinst(hdr, 0); @@ -864,9 +871,8 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, if (unlikely(rc)) return rc; - /* Update broadcast sequence number */ - tipc_group_update_bc_members(tsk->group); - + /* Update broadcast sequence number and send windows */ + tipc_group_update_bc_members(tsk->group, blks); return dlen; } @@ -1024,7 +1030,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; - if (unlikely(grp)) + if (unlikely(grp && !dest)) return tipc_send_group_bcast(sock, m, dlen, timeout); if (unlikely(!dest)) { @@ -1420,6 +1426,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; bool grp_evt; @@ -1436,8 +1443,8 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + /* Step rcv queue to first msg with data or error; wait if necessary */ do { - /* Look at first msg in receive queue; wait if necessary */ rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) goto exit; @@ -1485,12 +1492,21 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(flags & MSG_PEEK)) goto exit; + /* Send group flow control advertisement when applicable */ + if (tsk->group && msg_in_group(hdr) && !grp_evt) { + skb_queue_head_init(&xmitq); + tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), + msg_orignode(hdr), msg_origport(hdr), + &xmitq); + tipc_node_distr_xmit(sock_net(sk), &xmitq); + } + tsk_advance_rx_queue(sk); if (likely(!connected)) goto exit; - /* Send connection flow control ack when applicable */ + /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); @@ -1650,6 +1666,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); struct tipc_group *grp = tsk->group; + bool wakeup = false; switch (msg_user(hdr)) { case CONN_MANAGER: @@ -1658,19 +1675,23 @@ static void tipc_sk_proto_rcv(struct sock *sk, case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); tsk->cong_link_cnt--; - sk->sk_write_space(sk); + wakeup = true; break; case GROUP_PROTOCOL: - tipc_group_proto_rcv(grp, hdr, inputq, xmitq); + tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); break; case TOP_SRV: - tipc_group_member_evt(tsk->group, skb, inputq, xmitq); + tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, + skb, inputq, xmitq); skb = NULL; break; default: break; } + if (wakeup) + sk->sk_write_space(sk); + kfree_skb(skb); } @@ -1785,6 +1806,9 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + if (unlikely(msg_in_group(hdr))) + return sk->sk_rcvbuf; + if (unlikely(!msg_connected(hdr))) return sk->sk_rcvbuf << msg_importance(hdr); -- cgit v1.2.3 From 27bd9ec027f396457d1a147043c92ff22fc4c71e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:27 +0200 Subject: tipc: introduce group unicast messaging We now make it possible to send connectionless unicast messages within a communication group. To send a message, the sender can use either a direct port address, aka port identity, or an indirect port name to be looked up. This type of messages are subject to the same start synchronization and flow control mechanism as group broadcast messages. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 45 +++++++++++++++++++++- net/tipc/group.h | 3 ++ net/tipc/msg.h | 3 +- net/tipc/socket.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 150 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index b8ed70abba01..18440be5b5fc 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -186,6 +186,17 @@ struct tipc_member *tipc_group_find_member(struct tipc_group *grp, return NULL; } +static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, + u32 node, u32 port) +{ + struct tipc_member *m; + + m = tipc_group_find_member(grp, node, port); + if (m && tipc_group_is_enabled(m)) + return m; + return NULL; +} + static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { @@ -318,9 +329,39 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len) grp->bc_snd_nxt++; } -bool tipc_group_bc_cong(struct tipc_group *grp, int len) +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **mbr) { + struct sk_buff_head xmitq; struct tipc_member *m; + int adv, state; + + m = tipc_group_find_dest(grp, dnode, dport); + *mbr = m; + if (!m) + return false; + if (m->usr_pending) + return true; + if (m->window >= len) + return false; + m->usr_pending = true; + + /* If not fully advertised, do it now to prevent mutual blocking */ + adv = m->advertised; + state = m->state; + if (state < MBR_JOINED) + return true; + if (state == MBR_JOINED && adv == ADV_IDLE) + return true; + skb_queue_head_init(&xmitq); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); + tipc_node_distr_xmit(grp->net, &xmitq); + return true; +} + +bool tipc_group_bc_cong(struct tipc_group *grp, int len) +{ + struct tipc_member *m = NULL; if (list_empty(&grp->congested)) return false; @@ -329,7 +370,7 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len) if (m->window >= len) return false; - return true; + return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_filter_msg() - determine if we should accept arriving message diff --git a/net/tipc/group.h b/net/tipc/group.h index 0e2740e1da90..8f77290bb415 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -61,9 +61,12 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_update_bc_members(struct tipc_group *grp, int len); +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **m); bool tipc_group_bc_cong(struct tipc_group *grp, int len); void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); +void tipc_group_update_member(struct tipc_member *m, int len); int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 237d007499f9..f5033f4a7951 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -67,6 +67,7 @@ struct plist; #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 +#define TIPC_GRP_UCAST_MSG 6 /* * Internal message users @@ -261,7 +262,7 @@ static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); - return (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MEMBER_EVT); + return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 50145c95ac96..e71c8d23acb9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -817,6 +817,93 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, return rc ? rc : dlen; } +/** + * tipc_send_group_msg - send a message to a member in the group + * @net: network namespace + * @m: message to send + * @mb: group member + * @dnode: destination node + * @dport: destination port + * @dlen: total length of message data + */ +static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, + struct msghdr *m, struct tipc_member *mb, + u32 dnode, u32 dport, int dlen) +{ + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + struct sk_buff_head pkts; + int mtu, rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_UCAST_MSG); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, dport); + msg_set_destnode(hdr, dnode); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tipc_dest_push(&tsk->cong_links, dnode, 0); + tsk->cong_link_cnt++; + } + + /* Update send window and sequence number */ + tipc_group_update_member(mb, blks); + + return dlen; +} + +/** + * tipc_send_group_unicast - send message to a member in the group + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + struct tipc_member *mb = NULL; + u32 node, port; + int rc; + + node = dest->addr.id.node; + port = dest->addr.id.ref; + if (!port && !node) + return -EHOSTUNREACH; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(&tsk->cong_links, node, 0) && + !tipc_group_cong(grp, node, port, blks, &mb)); + if (unlikely(rc)) + return rc; + + if (unlikely(!mb)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mb, node, port, dlen); + + return rc ? rc : dlen; +} + /** * tipc_send_group_bcast - send message to all members in communication group * @sk: socket structure @@ -1030,8 +1117,20 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; - if (unlikely(grp && !dest)) - return tipc_send_group_bcast(sock, m, dlen, timeout); + if (likely(dest)) { + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + } + + if (grp) { + if (!dest) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_ID) + return tipc_send_group_unicast(sock, m, dlen, timeout); + return -EINVAL; + } if (unlikely(!dest)) { dest = &tsk->peer; @@ -1039,12 +1138,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EDESTADDRREQ; } - if (unlikely(m->msg_namelen < sizeof(*dest))) - return -EINVAL; - - if (unlikely(dest->family != AF_TIPC)) - return -EINVAL; - if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; @@ -1077,7 +1170,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(hdr, TIPC_DIRECT_MSG); @@ -1846,7 +1938,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG)) + else if (unlikely(msg_type(hdr) > TIPC_GRP_UCAST_MSG)) return kfree_skb(skb); if (unlikely(grp)) -- cgit v1.2.3 From ee106d7f942dabce1352e01c6fe9ca4a720c2331 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:28 +0200 Subject: tipc: introduce group anycast messaging In this commit, we make it possible to send connectionless unicast messages to any member corresponding to the given member identity, when there is more than one such member. The sender must use a TIPC_ADDR_NAME address to achieve this effect. We also perform load balancing between the destinations, i.e., we primarily select one which has advertised sufficient send window to not cause a block/EAGAIN delay, if any. This mechanism is overlayed on the always present round-robin selection. Anycast messages are subject to the same start synchronization and flow control mechanism as group broadcast messages. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 7 +++++ net/tipc/group.h | 3 ++ net/tipc/name_table.c | 41 +++++++++++++++++++++++++ net/tipc/name_table.h | 3 ++ net/tipc/socket.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 138 insertions(+) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 18440be5b5fc..16aaaa97a005 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -116,6 +116,13 @@ static bool tipc_group_is_receiver(struct tipc_member *m) return m && m->state >= MBR_JOINED; } +u32 tipc_group_exclude(struct tipc_group *grp) +{ + if (!grp->loopback) + return grp->portid; + return 0; +} + int tipc_group_size(struct tipc_group *grp) { return grp->member_cnt; diff --git a/net/tipc/group.h b/net/tipc/group.h index 8f77290bb415..e432066a211e 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -49,6 +49,7 @@ void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, int *scope); +u32 tipc_group_exclude(struct tipc_group *grp); void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); @@ -68,5 +69,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); +struct tipc_member *tipc_group_find_sender(struct tipc_group *grp, + u32 node, u32 port); int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 114d72bab827..2856e19e036e 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -597,6 +597,47 @@ not_found: return ref; } +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all) +{ + u32 self = tipc_own_addr(net); + struct publication *publ; + struct name_info *info; + struct name_seq *seq; + struct sub_seq *sseq; + + if (!tipc_in_scope(domain, self)) + return false; + + *dstcnt = 0; + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (unlikely(!seq)) + goto exit; + spin_lock_bh(&seq->lock); + sseq = nameseq_find_subseq(seq, instance); + if (likely(sseq)) { + info = sseq->info; + list_for_each_entry(publ, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, publ->node)) + continue; + if (publ->ref == exclude && publ->node == self) + continue; + tipc_dest_push(dsts, publ->node, publ->ref); + (*dstcnt)++; + if (all) + continue; + list_move_tail(&publ->zone_list, &info->zone_list); + break; + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); + return !list_empty(dsts); +} + int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports) { diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 97646b17a4a2..71926e429446 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -107,6 +107,9 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, u32 domain, struct tipc_nlist *nodes); +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e71c8d23acb9..66165e12d2f4 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -904,6 +904,88 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, return rc ? rc : dlen; } +/** + * tipc_send_group_anycast - send message to any member with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + struct list_head *cong_links = &tsk->cong_links; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_group *grp = tsk->group; + struct tipc_member *first = NULL; + struct tipc_member *mbr = NULL; + struct net *net = sock_net(sk); + u32 node, port, exclude; + u32 type, inst, domain; + struct list_head dsts; + int lookups = 0; + int dstcnt, rc; + bool cong; + + INIT_LIST_HEAD(&dsts); + + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + + while (++lookups < 4) { + first = NULL; + + /* Look for a non-congested destination member, if any */ + while (1) { + if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + &dstcnt, exclude, false)) + return -EHOSTUNREACH; + tipc_dest_pop(&dsts, &node, &port); + cong = tipc_group_cong(grp, node, port, blks, &mbr); + if (!cong) + break; + if (mbr == first) + break; + if (!first) + first = mbr; + } + + /* Start over if destination was not in member list */ + if (unlikely(!mbr)) + continue; + + if (likely(!cong && !tipc_dest_find(cong_links, node, 0))) + break; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(cong_links, node, 0) && + !tipc_group_cong(grp, node, port, + blks, &mbr)); + if (unlikely(rc)) + return rc; + + /* Send, unless destination disappeared while waiting */ + if (likely(mbr)) + break; + } + + if (unlikely(lookups >= 4)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen); + + return rc ? rc : dlen; +} + /** * tipc_send_group_bcast - send message to all members in communication group * @sk: socket structure @@ -1127,6 +1209,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (grp) { if (!dest) return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) + return tipc_send_group_anycast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_ID) return tipc_send_group_unicast(sock, m, dlen, timeout); return -EINVAL; -- cgit v1.2.3 From 5b8dddb63769587badc50725ec9857caaeba4de0 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:29 +0200 Subject: tipc: introduce group multicast messaging The previously introduced message transport to all group members is based on the tipc multicast service, but is logically a broadcast service within the group, and that is what we call it. We now add functionality for sending messages to all group members having a certain identity. Correspondingly, we call this feature 'group multicast'. The service is using unicast when only one destination is found, otherwise it will use the bearer broadcast service to transfer the messages. In the latter case, the receiving members filter arriving messages by looking at the intended destination instance. If there is no match, the message will be dropped, while still being considered received and read as seen by the flow control mechanism. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 14 +++++++++++++- net/tipc/msg.h | 11 +++++++++-- net/tipc/socket.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 74 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 16aaaa97a005..ffac2f33fce2 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -413,10 +413,22 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!tipc_group_is_receiver(m)) goto drop; + m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; + + /* Drop multicast here if not for this member */ + if (mtyp == TIPC_GRP_MCAST_MSG) { + if (msg_nameinst(hdr) != grp->instance) { + m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; + tipc_group_update_rcv_win(grp, msg_blocks(hdr), + node, port, xmitq); + kfree_skb(skb); + return; + } + } + TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); - m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; return; drop: kfree_skb(skb); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index f5033f4a7951..d6f98215267e 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -67,7 +67,8 @@ struct plist; #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 -#define TIPC_GRP_UCAST_MSG 6 +#define TIPC_GRP_MCAST_MSG 6 +#define TIPC_GRP_UCAST_MSG 7 /* * Internal message users @@ -195,6 +196,11 @@ static inline u32 msg_size(struct tipc_msg *m) return msg_bits(m, 0, 0, 0x1ffff); } +static inline u32 msg_blocks(struct tipc_msg *m) +{ + return (msg_size(m) / 1024) + 1; +} + static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); @@ -279,7 +285,8 @@ static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); - return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG)); + return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || + (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 66165e12d2f4..8fdd969e12bd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -999,6 +999,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); @@ -1021,11 +1022,16 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return rc; /* Complete message header */ - msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + if (dest) { + msg_set_type(hdr, TIPC_GRP_MCAST_MSG); + msg_set_nameinst(hdr, dest->addr.name.name.instance); + } else { + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_nameinst(hdr, 0); + } msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); - msg_set_nameinst(hdr, 0); msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); /* Build message as chain of buffers */ @@ -1045,6 +1051,48 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return dlen; } +/** + * tipc_send_group_mcast - send message to all members with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + u32 domain, exclude, dstcnt; + struct list_head dsts; + + INIT_LIST_HEAD(&dsts); + + if (seq->lower != seq->upper) + return -ENOTSUPP; + + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, + &dsts, &dstcnt, exclude, true)) + return -EHOSTUNREACH; + + if (dstcnt == 1) { + tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); + return tipc_send_group_unicast(sock, m, dlen, timeout); + } + + tipc_dest_list_purge(&dsts); + return tipc_send_group_bcast(sock, m, dlen, timeout); +} + /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup @@ -1213,6 +1261,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return tipc_send_group_anycast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_ID) return tipc_send_group_unicast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_send_group_mcast(sock, m, dlen, timeout); return -EINVAL; } @@ -2022,8 +2072,6 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_GRP_UCAST_MSG)) - return kfree_skb(skb); if (unlikely(grp)) tipc_group_filter_msg(grp, &inputq, xmitq); -- cgit v1.2.3 From b87a5ea31c935a7f7e11ca85df2ec7917921e96d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:30 +0200 Subject: tipc: guarantee group unicast doesn't bypass group broadcast Group unicast messages don't follow the same path as broadcast messages, and there is a high risk that unicasts sent from a socket might bypass previously sent broadcasts from the same socket. We fix this by letting all unicast messages carry the sequence number of the next sent broadcast from the same node, but without updating this number at the receiver. This way, a receiver can check and if necessary re-order such messages before they are added to the socket receive buffer. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 87 +++++++++++++++++++++++++++++++++++++++++++++---------- net/tipc/socket.c | 2 ++ 2 files changed, 74 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index ffac2f33fce2..985e0ce32e8e 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -62,6 +62,7 @@ struct tipc_member { struct list_head list; struct list_head congested; struct sk_buff *event_msg; + struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; @@ -253,6 +254,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->congested); + __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; @@ -380,29 +382,54 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len) return tipc_group_cong(grp, m->node, m->port, len, &m); } +/* tipc_group_sort_msg() - sort msg into queue by bcast sequence number + */ +static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) +{ + struct tipc_msg *_hdr, *hdr = buf_msg(skb); + u16 bc_seqno = msg_grp_bc_seqno(hdr); + struct sk_buff *_skb, *tmp; + int mtyp = msg_type(hdr); + + /* Bcast may be bypassed by unicast, - sort it in */ + if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { + skb_queue_walk_safe(defq, _skb, tmp) { + _hdr = buf_msg(_skb); + if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) + continue; + __skb_queue_before(defq, _skb, skb); + return; + } + /* Bcast was not bypassed, - add to tail */ + } + /* Unicasts are never bypassed, - always add to tail */ + __skb_queue_tail(defq, skb); +} + /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); + struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; + bool deliver, update; u32 node, port; - int mtyp; + int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); - mtyp = msg_type(hdr); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; - if (mtyp == TIPC_GRP_MEMBER_EVT) { + if (msg_is_grp_evt(hdr)) { if (!grp->events) goto drop; __skb_queue_tail(inputq, skb); @@ -413,22 +440,52 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!tipc_group_is_receiver(m)) goto drop; - m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; + if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + goto drop; - /* Drop multicast here if not for this member */ - if (mtyp == TIPC_GRP_MCAST_MSG) { - if (msg_nameinst(hdr) != grp->instance) { - m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; - tipc_group_update_rcv_win(grp, msg_blocks(hdr), - node, port, xmitq); - kfree_skb(skb); - return; + TIPC_SKB_CB(skb)->orig_member = m->instance; + defq = &m->deferredq; + tipc_group_sort_msg(skb, defq); + + while ((skb = skb_peek(defq))) { + hdr = buf_msg(skb); + mtyp = msg_type(hdr); + deliver = true; + update = false; + + if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + break; + + /* Decide what to do with message */ + switch (mtyp) { + case TIPC_GRP_MCAST_MSG: + if (msg_nameinst(hdr) != grp->instance) { + update = true; + deliver = false; + } + /* Fall thru */ + case TIPC_GRP_BCAST_MSG: + m->bc_rcv_nxt++; + break; + case TIPC_GRP_UCAST_MSG: + break; + default: + break; } - } - TIPC_SKB_CB(skb)->orig_member = m->instance; - __skb_queue_tail(inputq, skb); + /* Execute decisions */ + __skb_dequeue(defq); + if (deliver) + __skb_queue_tail(inputq, skb); + else + kfree_skb(skb); + + if (!update) + continue; + blks = msg_blocks(hdr); + tipc_group_update_rcv_win(grp, blks, node, port, xmitq); + } return; drop: kfree_skb(skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8fdd969e12bd..3276b7a0d445 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -830,6 +830,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, struct msghdr *m, struct tipc_member *mb, u32 dnode, u32 dport, int dlen) { + u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct sk_buff_head pkts; @@ -840,6 +841,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, dport); msg_set_destnode(hdr, dnode); + msg_set_grp_bc_seqno(hdr, bc_snd_nxt); /* Build message as chain of buffers */ skb_queue_head_init(&pkts); -- cgit v1.2.3 From 2f487712b89376fce267223bbb0db93d393d4b09 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:31 +0200 Subject: tipc: guarantee that group broadcast doesn't bypass group unicast We need a mechanism guaranteeing that group unicasts sent out from a socket are not bypassed by later sent broadcasts from the same socket. We do this as follows: - Each time a unicast is sent, we set a the broadcast method for the socket to "replicast" and "mandatory". This forces the first subsequent broadcast message to follow the same network and data path as the preceding unicast to a destination, hence preventing it from overtaking the latter. - In order to make the 'same data path' statement above true, we let group unicasts pass through the multicast link input queue, instead of as previously through the unicast link input queue. - In the first broadcast following a unicast, we set a new header flag, requiring all recipients to immediately acknowledge its reception. - During the period before all the expected acknowledges are received, the socket refuses to accept any more broadcast attempts, i.e., by blocking or returning EAGAIN. This period should typically not be longer than a few microseconds. - When all acknowledges have been received, the sending socket will open up for subsequent broadcasts, this time giving the link layer freedom to itself select the best transmission method. - The forced and/or abrupt transmission method changes described above may lead to broadcasts arriving out of order to the recipients. We remedy this by introducing code that checks and if necessary re-orders such messages at the receiving end. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 47 +++++++++++++++++++++++++++++++++++++++++------ net/tipc/group.h | 4 +--- net/tipc/link.c | 5 ++--- net/tipc/msg.h | 21 +++++++++++++++++++++ net/tipc/socket.c | 34 +++++++++++++++++++++++++++++----- 5 files changed, 94 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 985e0ce32e8e..eab862e047dc 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -71,6 +71,7 @@ struct tipc_member { u16 advertised; u16 window; u16 bc_rcv_nxt; + u16 bc_acked; bool usr_pending; }; @@ -87,6 +88,7 @@ struct tipc_group { u32 portid; u16 member_cnt; u16 bc_snd_nxt; + u16 bc_ackers; bool loopback; bool events; }; @@ -258,6 +260,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, m->group = grp; m->node = node; m->port = port; + m->bc_acked = grp->bc_snd_nxt - 1; grp->member_cnt++; tipc_group_add_to_tree(grp, m); tipc_nlist_add(&grp->dests, m->node); @@ -275,6 +278,11 @@ static void tipc_group_delete_member(struct tipc_group *grp, { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; + + /* Check if we were waiting for replicast ack from this member */ + if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) + grp->bc_ackers--; + list_del_init(&m->list); list_del_init(&m->congested); @@ -325,16 +333,23 @@ void tipc_group_update_member(struct tipc_member *m, int len) list_add_tail(&m->congested, &grp->congested); } -void tipc_group_update_bc_members(struct tipc_group *grp, int len) +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { + u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); - if (tipc_group_is_enabled(m)) + if (tipc_group_is_enabled(m)) { tipc_group_update_member(m, len); + m->bc_acked = prev; + } } + + /* Mark number of acknowledges to expect, if any */ + if (ack) + grp->bc_ackers = grp->member_cnt; grp->bc_snd_nxt++; } @@ -372,6 +387,10 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; + /* If prev bcast was replicast, reject until all receivers have acked */ + if (grp->bc_ackers) + return true; + if (list_empty(&grp->congested)) return false; @@ -391,7 +410,7 @@ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); - /* Bcast may be bypassed by unicast, - sort it in */ + /* Bcast may be bypassed by unicast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); @@ -412,10 +431,10 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); + bool ack, deliver, update; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; - bool deliver, update; u32 node, port; int mtyp, blks; @@ -451,6 +470,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, hdr = buf_msg(skb); mtyp = msg_type(hdr); deliver = true; + ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) @@ -466,6 +486,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, /* Fall thru */ case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; + ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; @@ -480,6 +501,9 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, else kfree_skb(skb); + if (ack) + tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); + if (!update) continue; @@ -540,6 +564,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; + } else if (mtyp == GRP_ACK_MSG) { + msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } __skb_queue_tail(xmitq, skb); } @@ -593,7 +619,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, } /* Otherwise deliver already received WITHDRAW event */ __skb_queue_tail(inputq, m->event_msg); - *usr_wakeup = m->usr_pending; + *usr_wakeup = true; tipc_group_delete_member(grp, m); list_del_init(&m->congested); return; @@ -605,6 +631,15 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, m->usr_pending = false; list_del_init(&m->congested); return; + case GRP_ACK_MSG: + if (!m) + return; + m->bc_acked = msg_grp_bc_acked(hdr); + if (--grp->bc_ackers) + break; + *usr_wakeup = true; + m->usr_pending = false; + return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } @@ -678,7 +713,7 @@ void tipc_group_member_evt(struct tipc_group *grp, TIPC_SKB_CB(skb)->orig_member = m->instance; - *usr_wakeup = m->usr_pending; + *usr_wakeup = true; m->usr_pending = false; /* Hold back event if more messages might be expected */ diff --git a/net/tipc/group.h b/net/tipc/group.h index e432066a211e..d525e1cd7de5 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -61,7 +61,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); -void tipc_group_update_bc_members(struct tipc_group *grp, int len); +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack); bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **m); bool tipc_group_bc_cong(struct tipc_group *grp, int len); @@ -69,7 +69,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); -struct tipc_member *tipc_group_find_sender(struct tipc_group *grp, - u32 node, u32 port); int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index bd25bff63925..70a21499804d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1046,13 +1046,12 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: - if (unlikely(msg_mcast(hdr))) { + if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) { skb_queue_tail(l->bc_rcvlink->inputq, skb); return true; } - case CONN_MANAGER: case GROUP_PROTOCOL: - skb_queue_tail(inputq, skb); + case CONN_MANAGER: return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d6f98215267e..52c6a2e01995 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -547,6 +547,7 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 +#define GRP_ACK_MSG 3 /* * Word 1 @@ -839,6 +840,16 @@ static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u16 msg_grp_bc_acked(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) @@ -851,6 +862,16 @@ static inline void msg_set_grp_evt(struct tipc_msg *m, int n) msg_set_bits(m, 10, 0, 0x3, n); } +static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x1); +} + +static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) +{ + msg_set_bits(m, 10, 0, 0x1, n); +} + static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3276b7a0d445..b1f1c3c2b1e2 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -831,6 +831,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, u32 dnode, u32 dport, int dlen) { u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); + struct tipc_mc_method *method = &tsk->mc_method; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct sk_buff_head pkts; @@ -857,9 +858,12 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, tsk->cong_link_cnt++; } - /* Update send window and sequence number */ + /* Update send window */ tipc_group_update_member(mb, blks); + /* A broadcast sent within next EXPIRE period must follow same path */ + method->rcast = true; + method->mandatory = true; return dlen; } @@ -1008,6 +1012,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct tipc_group *grp = tsk->group; struct tipc_nlist *dsts = tipc_group_dests(grp); struct tipc_mc_method *method = &tsk->mc_method; + bool ack = method->mandatory && method->rcast; int blks = tsk_blocks(MCAST_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; int mtu = tipc_bcast_get_mtu(net); @@ -1036,6 +1041,9 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, msg_set_destnode(hdr, 0); msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + /* Avoid getting stuck with repeated forced replicasts */ + msg_set_grp_bc_ack_req(hdr, ack); + /* Build message as chain of buffers */ skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); @@ -1043,13 +1051,17 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return rc; /* Send message */ - rc = tipc_mcast_xmit(net, &pkts, method, dsts, - &tsk->cong_link_cnt); + rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt); if (unlikely(rc)) return rc; /* Update broadcast sequence number and send windows */ - tipc_group_update_bc_members(tsk->group, blks); + tipc_group_update_bc_members(tsk->group, blks, ack); + + /* Broadcast link is now free to choose method for next broadcast */ + method->mandatory = false; + method->expires = jiffies; + return dlen; } @@ -1113,7 +1125,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, u32 portid, oport, onode; struct list_head dports; struct tipc_msg *msg; - int hsz; + int user, mtyp, hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -1121,6 +1133,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); + user = msg_user(msg); + mtyp = msg_type(msg); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { + spin_lock_bh(&inputq->lock); + if (skb_peek(arrvq) == skb) { + __skb_dequeue(arrvq); + __skb_queue_tail(inputq, skb); + } + refcount_dec(&skb->users); + spin_unlock_bh(&inputq->lock); + continue; + } hsz = skb_headroom(skb) + msg_hdr_sz(msg); oport = msg_origport(msg); onode = msg_orignode(msg); -- cgit v1.2.3 From 399574d41963285e72ba28dd46783c96316a81d1 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:32 +0200 Subject: tipc: guarantee delivery of UP event before first broadcast The following scenario is possible: - A user joins a group, and immediately sends out a broadcast message to its members. - The broadcast message, following a different data path than the initial JOIN message sent out during the joining procedure, arrives to a receiver before the latter.. - The receiver drops the message, since it is not ready to accept any messages until the JOIN has arrived. We avoid this by treating group protocol JOIN messages like unicast messages. - We let them pass through the recipient's multicast input queue, just like ordinary unicasts. - We force the first following broadacst to be sent as replicated unicast and being acknowledged by the recipient before accepting any more broadcast transmissions. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/link.c | 7 +++++-- net/tipc/socket.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 70a21499804d..723dd6998684 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1039,6 +1039,7 @@ int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { + struct sk_buff_head *mc_inputq = l->bc_rcvlink->inputq; struct tipc_msg *hdr = buf_msg(skb); switch (msg_user(hdr)) { @@ -1047,12 +1048,14 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) { - skb_queue_tail(l->bc_rcvlink->inputq, skb); + skb_queue_tail(mc_inputq, skb); return true; } - case GROUP_PROTOCOL: case CONN_MANAGER: return true; + case GROUP_PROTOCOL: + skb_queue_tail(mc_inputq, skb); + return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; skb_queue_tail(l->namedq, skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b1f1c3c2b1e2..2bbab4fe2f53 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2762,6 +2762,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) tipc_group_delete(net, grp); + + /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; return rc; } -- cgit v1.2.3 From a3bada70660fb020430135ec8a774ae1ea6bc9a9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:33 +0200 Subject: tipc: guarantee delivery of last broadcast before DOWN event The following scenario is possible: - A user sends a broadcast message, and thereafter immediately leaves the group. - The LEAVE message, following a different path than the broadcast, arrives ahead of the broadcast, and the sending member is removed from the receiver's list. - The broadcast message arrives, but is dropped because the sender now is unknown to the receipient. We fix this by sequence numbering membership events, just like ordinary unicast messages. Currently, when a JOIN is sent to a peer, it contains a synchronization point, - the sequence number of the next sent broadcast, in order to give the receiver a start synchronization point. We now let even LEAVE messages contain such an "end synchronization" point, so that the recipient can delay the removal of the sending member until it knows that all messages have been received. The received synchronization points are added as sequence numbers to the generated membership events, making it possible to handle them almost the same way as regular unicasts in the receiving filter function. In particular, a DOWN event with a too high sequence number will be kept in the reordering queue until the missing broadcast(s) arrive and have been delivered. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index eab862e047dc..8f0eb5d22e8f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -71,6 +71,7 @@ struct tipc_member { u16 advertised; u16 window; u16 bc_rcv_nxt; + u16 bc_syncpt; u16 bc_acked; bool usr_pending; }; @@ -410,7 +411,7 @@ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); - /* Bcast may be bypassed by unicast or other bcast, - sort it in */ + /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); @@ -431,7 +432,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); - bool ack, deliver, update; + bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; @@ -448,13 +449,6 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!msg_in_group(hdr)) goto drop; - if (msg_is_grp_evt(hdr)) { - if (!grp->events) - goto drop; - __skb_queue_tail(inputq, skb); - return; - } - m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_receiver(m)) goto drop; @@ -490,6 +484,12 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, break; case TIPC_GRP_UCAST_MSG: break; + case TIPC_GRP_MEMBER_EVT: + if (m->state == MBR_LEAVING) + leave = true; + if (!grp->events) + deliver = false; + break; default: break; } @@ -504,6 +504,11 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); + if (leave) { + tipc_group_delete_member(grp, m); + __skb_queue_purge(defq); + break; + } if (!update) continue; @@ -561,6 +566,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; + } else if (mtyp == GRP_LEAVE_MSG) { + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; @@ -577,6 +584,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m; + struct tipc_msg *ehdr; if (!grp) return; @@ -590,7 +598,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, MBR_QUARANTINED); if (!m) return; - m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); + m->bc_syncpt = msg_grp_bc_syncpt(hdr); + m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received */ @@ -601,6 +610,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, *usr_wakeup = true; m->usr_pending = false; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); } if (m->window < ADV_IDLE) @@ -611,6 +622,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, case GRP_LEAVE_MSG: if (!m) return; + m->bc_syncpt = msg_grp_bc_syncpt(hdr); /* Wait until WITHDRAW event is received */ if (m->state != MBR_LEAVING) { @@ -618,9 +630,10 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, return; } /* Otherwise deliver already received WITHDRAW event */ + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); *usr_wakeup = true; - tipc_group_delete_member(grp, m); list_del_init(&m->congested); return; case GRP_ADV_MSG: @@ -662,6 +675,7 @@ void tipc_group_member_evt(struct tipc_group *grp, int event = evt->event; struct tipc_member *m; struct net *net; + bool node_up; u32 self; if (!grp) @@ -695,6 +709,7 @@ void tipc_group_member_evt(struct tipc_group *grp, m->event_msg = skb; m->state = MBR_PUBLISHED; } else { + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); __skb_queue_tail(inputq, skb); m->state = MBR_JOINED; *usr_wakeup = true; @@ -715,14 +730,18 @@ void tipc_group_member_evt(struct tipc_group *grp, *usr_wakeup = true; m->usr_pending = false; + node_up = tipc_node_is_up(net, node); /* Hold back event if more messages might be expected */ - if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) { + if (m->state != MBR_LEAVING && node_up) { m->event_msg = skb; m->state = MBR_LEAVING; } else { + if (node_up) + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + else + msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); __skb_queue_tail(inputq, skb); - tipc_group_delete_member(grp, m); } list_del_init(&m->congested); } -- cgit v1.2.3 From 04d7b574b245c66001a33cb9da2c0311063af73f Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:34 +0200 Subject: tipc: add multipoint-to-point flow control We already have point-to-multipoint flow control within a group. But we even need the opposite; -a scheme which can handle that potentially hundreds of sources may try to send messages to the same destination simultaneously without causing buffer overflow at the recipient. This commit adds such a mechanism. The algorithm works as follows: - When a member detects a new, joining member, it initially set its state to JOINED and advertises a minimum window to the new member. This window is chosen so that the new member can send exactly one maximum sized message, or several smaller ones, to the recipient before it must stop and wait for an additional advertisement. This minimum window ADV_IDLE is set to 65 1kB blocks. - When a member receives the first data message from a JOINED member, it changes the state of the latter to ACTIVE, and advertises a larger window ADV_ACTIVE = 12 x ADV_IDLE blocks to the sender, so it can continue sending with minimal disturbances to the data flow. - The active members are kept in a dedicated linked list. Each time a message is received from an active member, it will be moved to the tail of that list. This way, we keep a record of which members have been most (tail) and least (head) recently active. - There is a maximum number (16) of permitted simultaneous active senders per receiver. When this limit is reached, the receiver will not advertise anything immediately to a new sender, but instead put it in a PENDING state, and add it to a corresponding queue. At the same time, it will pick the least recently active member, send it an advertisement RECLAIM message, and set this member to state RECLAIMING. - The reclaimee member has to respond with a REMIT message, meaning that it goes back to a send window of ADV_IDLE, and returns its unused advertised blocks beyond that value to the reclaiming member. - When the reclaiming member receives the REMIT message, it unlinks the reclaimee from its active list, resets its state to JOINED, and notes that it is now back at ADV_IDLE advertised blocks to that member. If there are still unread data messages sent out by reclaimee before the REMIT, the member goes into an intermediate state REMITTED, where it stays until the said messages have been consumed. - The returned advertised blocks can now be re-advertised to the pending member, which is now set to state ACTIVE and added to the active member list. - To be proactive, i.e., to minimize the risk that any member will end up in the pending queue, we start reclaiming resources already when the number of active members exceeds 3/4 of the permitted maximum. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/group.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- net/tipc/msg.h | 12 ++++++ 2 files changed, 136 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 8f0eb5d22e8f..7821085a7dd8 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -54,6 +54,10 @@ enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, + MBR_PENDING, + MBR_ACTIVE, + MBR_RECLAIMING, + MBR_REMITTED, MBR_LEAVING }; @@ -79,6 +83,9 @@ struct tipc_member { struct tipc_group { struct rb_root members; struct list_head congested; + struct list_head pending; + struct list_head active; + struct list_head reclaiming; struct tipc_nlist dests; struct net *net; int subid; @@ -88,6 +95,8 @@ struct tipc_group { u32 scope; u32 portid; u16 member_cnt; + u16 active_cnt; + u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool loopback; @@ -97,12 +106,29 @@ struct tipc_group { static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); +static void tipc_group_decr_active(struct tipc_group *grp, + struct tipc_member *m) +{ + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + grp->active_cnt--; +} + static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { + int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; + /* Limit simultaneous reception from other members */ + max_active = min(mcnt / 8, 64); + max_active = max(max_active, 16); + grp->max_active = max_active; + + /* Reserve blocks for active and idle members */ + active_pool = max_active * ADV_ACTIVE; + idle_pool = (mcnt - max_active) * ADV_IDLE; + /* Scale to bytes, considering worst-case truesize/msgsize ratio */ - return mcnt * ADV_ACTIVE * FLOWCTL_BLK_SZ * 4; + return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) @@ -143,6 +169,9 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->congested); + INIT_LIST_HEAD(&grp->active); + INIT_LIST_HEAD(&grp->pending); + INIT_LIST_HEAD(&grp->reclaiming); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; @@ -286,6 +315,7 @@ static void tipc_group_delete_member(struct tipc_group *grp, list_del_init(&m->list); list_del_init(&m->congested); + tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) @@ -378,6 +408,10 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, return true; if (state == MBR_JOINED && adv == ADV_IDLE) return true; + if (state == MBR_ACTIVE && adv == ADV_ACTIVE) + return true; + if (state == MBR_PENDING && adv == ADV_IDLE) + return true; skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); @@ -523,7 +557,11 @@ drop: void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { - struct tipc_member *m; + struct list_head *active = &grp->active; + int max_active = grp->max_active; + int reclaim_limit = max_active * 3 / 4; + int active_cnt = grp->active_cnt; + struct tipc_member *m, *rm; m = tipc_group_find_member(grp, node, port); if (!m) @@ -533,9 +571,41 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, switch (m->state) { case MBR_JOINED: - if (m->advertised <= (ADV_ACTIVE - ADV_UNIT)) + /* Reclaim advertised space from least active member */ + if (!list_empty(active) && active_cnt >= reclaim_limit) { + rm = list_first_entry(active, struct tipc_member, list); + rm->state = MBR_RECLAIMING; + list_move_tail(&rm->list, &grp->reclaiming); + tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); + } + /* If max active, become pending and wait for reclaimed space */ + if (active_cnt >= max_active) { + m->state = MBR_PENDING; + list_add_tail(&m->list, &grp->pending); + break; + } + /* Otherwise become active */ + m->state = MBR_ACTIVE; + list_add_tail(&m->list, &grp->active); + grp->active_cnt++; + /* Fall through */ + case MBR_ACTIVE: + if (!list_is_last(&m->list, &grp->active)) + list_move_tail(&m->list, &grp->active); + if (m->advertised > (ADV_ACTIVE * 3 / 4)) + break; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + break; + case MBR_REMITTED: + if (m->advertised > ADV_IDLE) + break; + m->state = MBR_JOINED; + if (m->advertised < ADV_IDLE) { + pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + } break; + case MBR_RECLAIMING: case MBR_DISCOVERED: case MBR_JOINING: case MBR_LEAVING: @@ -557,8 +627,10 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, if (!skb) return; - if (m->state == MBR_JOINED) + if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; + else if (m->state == MBR_JOINED || m->state == MBR_PENDING) + adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); @@ -573,6 +645,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); + } else if (mtyp == GRP_REMIT_MSG) { + msg_set_grp_remitted(hdr, m->window); } __skb_queue_tail(xmitq, skb); } @@ -583,8 +657,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); - struct tipc_member *m; + struct tipc_member *m, *pm; struct tipc_msg *ehdr; + u16 remitted, in_flight; if (!grp) return; @@ -626,6 +701,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, /* Wait until WITHDRAW event is received */ if (m->state != MBR_LEAVING) { + tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; return; } @@ -653,6 +729,48 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, *usr_wakeup = true; m->usr_pending = false; return; + case GRP_RECLAIM_MSG: + if (!m) + return; + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); + m->window = ADV_IDLE; + return; + case GRP_REMIT_MSG: + if (!m || m->state != MBR_RECLAIMING) + return; + + list_del_init(&m->list); + grp->active_cnt--; + remitted = msg_grp_remitted(hdr); + + /* Messages preceding the REMIT still in receive queue */ + if (m->advertised > remitted) { + m->state = MBR_REMITTED; + in_flight = m->advertised - remitted; + } + /* All messages preceding the REMIT have been read */ + if (m->advertised <= remitted) { + m->state = MBR_JOINED; + in_flight = 0; + } + /* ..and the REMIT overtaken by more messages => re-advertise */ + if (m->advertised < remitted) + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + + m->advertised = ADV_IDLE + in_flight; + + /* Set oldest pending member to active and advertise */ + if (list_empty(&grp->pending)) + return; + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); + return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } @@ -735,6 +853,7 @@ void tipc_group_member_evt(struct tipc_group *grp, /* Hold back event if more messages might be expected */ if (m->state != MBR_LEAVING && node_up) { m->event_msg = skb; + tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; } else { if (node_up) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 52c6a2e01995..cedf811317fb 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -548,6 +548,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 +#define GRP_RECLAIM_MSG 4 +#define GRP_REMIT_MSG 5 /* * Word 1 @@ -850,6 +852,16 @@ static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u16 msg_grp_remitted(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) -- cgit v1.2.3 From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 7 Sep 2017 04:00:06 -0700 Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/sched/sch_mqprio.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6bcdfe6e7b63..f1ae9be83934 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -18,10 +18,16 @@ #include #include #include +#include struct mqprio_sched { struct Qdisc **qdiscs; + u16 mode; + u16 shaper; int hw_offload; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; }; static void mqprio_destroy(struct Qdisc *sch) @@ -39,9 +45,17 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt mqprio = {}; + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio); + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + &mqprio); + break; + default: + return; + } } else { netdev_set_num_tc(dev, 0); } @@ -97,6 +111,26 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return 0; } +static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { + [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, + [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, + [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, + [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, +}; + +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) +{ + int nested_len = nla_len(nla) - NLA_ALIGN(len); + + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = qdisc_dev(sch); @@ -105,6 +139,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int rem; + int len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); @@ -122,6 +160,58 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (mqprio_parse_opt(dev, qopt)) return -EINVAL; + if (len > 0) { + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + } + /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); @@ -146,14 +236,36 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt mqprio = *qopt; + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + err = dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_MQPRIO, &mqprio); if (err) return err; - priv->hw_offload = mqprio.hw; + priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -223,11 +335,51 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } +static int dump_rates(struct mqprio_sched *priv, + struct tc_mqprio_qopt *opt, struct sk_buff *skb) +{ + struct nlattr *nest; + int i; + + if (priv->flags & TC_MQPRIO_F_MIN_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, + sizeof(priv->min_rate[i]), + &priv->min_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + + if (priv->flags & TC_MQPRIO_F_MAX_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, + sizeof(priv->max_rate[i]), + &priv->max_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int i; @@ -258,12 +410,25 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MODE) && + nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_SHAPER) && + nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MIN_RATE || + priv->flags & TC_MQPRIO_F_MAX_RATE) && + (dump_rates(priv, &opt, skb) != 0)) goto nla_put_failure; - return skb->len; + return nla_nest_end(skb, nla); nla_put_failure: - nlmsg_trim(skb, b); + nlmsg_trim(skb, nla); return -1; } -- cgit v1.2.3 From 258bbb1b0e594ad5f5652cb526b3c63e6a7fad3d Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 12 Oct 2017 16:12:37 +0200 Subject: icmp: don't fail on fragment reassembly time exceeded The ICMP implementation currently replies to an ICMP time exceeded message (type 11) with an ICMP host unreachable message (type 3, code 1). However, time exceeded messages can either represent "time to live exceeded in transit" (code 0) or "fragment reassembly time exceeded" (code 1). Unconditionally replying to "fragment reassembly time exceeded" with host unreachable messages might cause unjustified connection resets which are now easily triggered as UFO has been removed, because, in turn, sending large buffers triggers IP fragmentation. The issue can be easily reproduced by running a lot of UDP streams which is likely to trigger IP fragmentation: # start netserver in the test namespace ip netns add test ip netns exec test netserver # create a VETH pair ip link add name veth0 type veth peer name veth0 netns test ip link set veth0 up ip -n test link set veth0 up for i in $(seq 20 29); do # assign addresses to both ends ip addr add dev veth0 192.168.$i.1/24 ip -n test addr add dev veth0 192.168.$i.2/24 # start the traffic netperf -L 192.168.$i.1 -H 192.168.$i.2 -t UDP_STREAM -l 0 & done # wait send_data: data send error: No route to host (errno 113) netperf: send_omni: send_data failed: No route to host We need to differentiate instead: if fragment reassembly time exceeded is reported, we need to silently drop the packet, if time to live exceeded is reported, maintain the current behaviour. In both cases increment the related error count "icmpInTimeExcds". While at it, fix a typo in a comment, and convert the if statement into a switch to mate it more readable. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..3c1570d3e22f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers -- cgit v1.2.3 From 841f4f24053acad69240c6ab7427a1d24bc29491 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 13 Oct 2017 14:18:09 -0400 Subject: net: dsa: remove .set_addr Now that there is no user for the .set_addr function, remove it from DSA. If a switch supports this feature (like mv88e6xxx), the implementation can be done in the driver setup. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 6 ------ net/dsa/legacy.c | 6 ------ 2 files changed, 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 54ed054777bd..6ac9e11d385c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -336,12 +336,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; - if (ds->ops->set_addr) { - err = ds->ops->set_addr(ds, dst->cpu_dp->netdev->dev_addr); - if (err < 0) - return err; - } - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 19ff6e0a21dc..b0fefbffe082 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -172,12 +172,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (ret) return ret; - if (ops->set_addr) { - ret = ops->set_addr(ds, master->dev_addr); - if (ret < 0) - return ret; - } - if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) -- cgit v1.2.3 From 657875944726af8f881df7d431afbbe8f50ad4d7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 12:58:13 -0700 Subject: net_sched: fix a compile warning in act_ife MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently ife_meta_id2name() is only called when CONFIG_MODULES is defined. This fixes: net/sched/act_ife.c:251:20: warning: ‘ife_meta_id2name’ defined but not used [-Wunused-function] static const char *ife_meta_id2name(u32 metaid) ^~~~~~~~~~~~~~~~ Fixes: d3f24ba895f0 ("net sched actions: fix module auto-loading") Cc: Roman Mashak Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 252ee7d8c731..3007cb1310ea 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -248,6 +248,7 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) return ret; } +#ifdef CONFIG_MODULES static const char *ife_meta_id2name(u32 metaid) { switch (metaid) { @@ -261,6 +262,7 @@ static const char *ife_meta_id2name(u32 metaid) return "unknown"; } } +#endif /* called when adding new meta information * under ife->tcf_lock for existing action -- cgit v1.2.3 From e086101b150ae8e99e54ab26101ef3835fa9f48d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 13:03:16 -0700 Subject: tcp: add a tracepoint for tcp retransmission We need a real-time notification for tcp retransmission for monitoring. Of course we could use ftrace to dynamically instrument this kernel function too, however we can't retrieve the connection information at the same time, for example perf-tools [1] reads /proc/net/tcp for socket details, which is slow when we have a lots of connections. Therefore, this patch adds a tracepoint for __tcp_retransmit_skb() and exposes src/dst IP addresses and ports of the connection. This also makes it easier to integrate into perf. Note, I expose both IPv4 and IPv6 addresses at the same time: for a IPv4 socket, v4 mapped address is used as IPv6 addresses, for a IPv6 socket, LOOPBACK4_IPV6 is already filled by kernel. Also, add sk and skb pointers as they are useful for BPF. 1. https://github.com/brendangregg/perf-tools/blob/master/net/tcpretrans Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Hannes Frederic Sowa Cc: Brendan Gregg Cc: Neal Cardwell Signed-off-by: Cong Wang Acked-by: Alexei Starovoitov Acked-by: Brendan Gregg Signed-off-by: David S. Miller --- net/core/net-traces.c | 1 + net/ipv4/tcp_output.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 1132820c8e62..f4e4fa2db505 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 696b0a168f16..6c74f2a39778 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -42,6 +42,8 @@ #include #include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -2875,6 +2877,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } -- cgit v1.2.3 From 8f66b1a529047a972cb9602a919c53a95f3d7a2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Oct 2017 12:03:26 -0400 Subject: xprtrdma: Don't defer fencing an async RPC's chunks In current kernels, waiting in xprt_release appears to be safe to do. I had erroneously believed that for ASYNC RPCs, waiting of any kind in xprt_release->xprt_rdma_free would result in deadlock. I've done injection testing and consulted with Trond to confirm that waiting in the RPC release path is safe. For the very few times where RPC resources haven't yet been released earlier by the reply handler, it is safe to wait synchronously in xprt_rdma_free for invalidation rather than defering it to MR recovery. Note: When the QP is error state, posting a LocalInvalidate should flush and mark the MR as bad. There is no way the remote HCA can access that MR via a QP in error state, so it is effectively already inaccessible and thus safe for the Upper Layer to access. The next time the MR is used it should be recognized and cleaned up properly by frwr_op_map. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c84e2b644e13..8cf5ccfe180d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -686,7 +686,7 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); if (!list_empty(&req->rl_registered)) - ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); + ia->ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); } -- cgit v1.2.3 From 4ce6c04c2acef91a10d65a3bcb622654bb01d930 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Oct 2017 12:03:34 -0400 Subject: xprtrdma: Use ro_unmap_sync in xprt_rdma_send_request The "safe" version of ro_unmap is used here to avoid waiting unnecessarily. However: - It is safe to wait. After all, we have to wait anyway when using FMR to register memory. - This case is rare: it occurs only after a reconnect. By switching this call site to ro_unmap_sync, the final use of ro_unmap_safe is removed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8cf5ccfe180d..eb46d2479b09 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -728,7 +728,8 @@ xprt_rdma_send_request(struct rpc_task *task) /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) -- cgit v1.2.3 From 2b4f8923ecaafc0c25ee56bc17ea9256d12b747c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Oct 2017 12:03:42 -0400 Subject: xprtrdma: Remove ro_unmap_safe Clean up: There are no remaining callers of this method. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 19 ------------------- net/sunrpc/xprtrdma/frwr_ops.c | 19 ------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 3 files changed, 40 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 6c7151341194..30bf713080f1 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -305,28 +305,9 @@ out_reset: } } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - fmr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_unmap_safe = fmr_op_unmap_safe, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index df062e086bdb..3053fb0f5cb3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -558,28 +558,9 @@ reset_mrs: goto unmap; } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - frwr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_unmap_safe = frwr_op_unmap_safe, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e26a97d2f922..74e017477e72 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -473,8 +473,6 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_unmap_safe)(struct rpcrdma_xprt *, - struct rpcrdma_req *, bool); void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, -- cgit v1.2.3 From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Oct 2017 11:38:45 -0700 Subject: mqprio: Reserve last 32 classid values for HW traffic classes and misc IDs This patch makes a slight tweak to mqprio in order to bring the classid values used back in line with what is used for mq. The general idea is to reserve values :ffe0 - :ffef to identify hardware traffic classes normally reported via dev->num_tc. By doing this we can maintain a consistent behavior with mq for classid where :1 - :ffdf will represent a physical qdisc mapped onto a Tx queue represented by classid - 1, and the traffic classes will be mapped onto a known subset of classid values reserved for our virtual qdiscs. Note I reserved the range from :fff0 - :ffff since this way we might be able to reuse these classid values with clsact and ingress which would mean that for mq, mqprio, ingress, and clsact we should be able to maintain a similar classid layout. Signed-off-by: Alexander Duyck Tested-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/sched/sch_mqprio.c | 79 +++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index f1ae9be83934..cae91b4b08a6 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -153,6 +153,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; + /* make certain can allocate enough classids to handle queues */ + if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) + return -ENOMEM; + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; @@ -305,7 +309,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); - unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; @@ -447,38 +451,35 @@ static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); - if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) - return 0; - return ntx; + /* There are essentially two regions here that have valid classid + * values. The first region will have a classid value of 1 through + * num_tx_queues. All of these are backed by actual Qdiscs. + */ + if (ntx < TC_H_MIN_PRIORITY) + return (ntx <= dev->num_tx_queues) ? ntx : 0; + + /* The second region represents the hardware traffic classes. These + * are represented by classid values of TC_H_MIN_PRIORITY through + * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 + */ + return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct net_device *dev = qdisc_dev(sch); + if (cl < TC_H_MIN_PRIORITY) { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + int tc = netdev_txq_to_tc(dev, cl - 1); - if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = (tc < 0) ? 0 : + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(tc + TC_H_MIN_PRIORITY)); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; - } else { - int i; - struct netdev_queue *dev_queue; - - dev_queue = mqprio_queue_get(sch, cl); - tcm->tcm_parent = 0; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - struct netdev_tc_txq tc = dev->tc_to_txq[i]; - int q_idx = cl - netdev_get_num_tc(dev); - - if (q_idx > tc.offset && - q_idx <= tc.offset + tc.count) { - tcm->tcm_parent = - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)); - break; - } - } - tcm->tcm_info = dev_queue->qdisc_sleeping->handle; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; @@ -489,15 +490,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct net_device *dev = qdisc_dev(sch); - - if (cl <= netdev_get_num_tc(dev)) { + if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; - struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we @@ -550,12 +550,25 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; - for (ntx = arg->skip; - ntx < dev->num_tx_queues + netdev_get_num_tc(dev); - ntx++) { + for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { + if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + + /* Pad the values and skip over unused traffic classes */ + if (ntx < TC_MAX_QUEUE) { + arg->count = TC_MAX_QUEUE; + ntx = TC_MAX_QUEUE; + } + + /* Reset offset, sort out remaining per-queue qdiscs */ + for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; - break; + return; } arg->count++; } -- cgit v1.2.3 From 69d78ef25c7b0058674145500efb12255738ba8a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:57 +0200 Subject: net: sched: store Qdisc pointer in struct block Prepare for removal of tp->q and store Qdisc pointer in the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 3 ++- net/sched/sch_atm.c | 4 ++-- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 4 ++-- net/sched/sch_htb.c | 4 ++-- net/sched/sch_ingress.c | 6 +++--- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- net/sched/sch_sfb.c | 2 +- net/sched/sch_sfq.c | 2 +- 14 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2977b8a90851..f7d3f1f539b7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -241,7 +241,7 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, } int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -257,6 +257,7 @@ int tcf_block_get(struct tcf_block **p_block, goto err_chain_create; } tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + block->q = q; *p_block = block; return 0; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c5fcdf1a58a0..2dbd249c0b2f 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -281,7 +281,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto err_out; } - error = tcf_block_get(&flow->block, &flow->filter_list); + error = tcf_block_get(&flow->block, &flow->filter_list, sch); if (error) { kfree(flow); goto err_out; @@ -546,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - err = tcf_block_get(&p->link.block, &p->link.filter_list); + err = tcf_block_get(&p->link.block, &p->link.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index dcef97fa8047..c3b92d62190e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1566,7 +1566,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 2d0e8d4bdc29..753dc7a77b60 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -412,7 +412,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; err = qdisc_class_hash_init(&q->clhash); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2836c80c7aa5..fb4fb71c68cf 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -344,7 +344,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = tcf_block_get(&p->block, &p->filter_list); + err = tcf_block_get(&p->block, &p->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index de3b57ceca7b..3c40edeff1e8 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -481,7 +481,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3f88b75488b0..a692184bd333 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1033,7 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; @@ -1405,7 +1405,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; - err = tcf_block_get(&q->root.block, &q->root.filter_list); + err = tcf_block_get(&q->root.block, &q->root.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c6d7ae81b41f..57be73c0e1d2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1030,7 +1030,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; @@ -1393,7 +1393,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); goto failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 44de4ee51ce9..9ccc1b89b0d9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -59,7 +59,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list); + err = tcf_block_get(&q->block, &dev->ingress_cl_list, sch); if (err) return err; @@ -153,11 +153,11 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list, sch); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + err = tcf_block_get(&q->egress_block, &dev->egress_cl_list, sch); if (err) return err; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index ff4fc3e0facd..31e0a284eeff 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -245,7 +245,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 2dd6c68ae91e..95fad348c8d7 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -212,7 +212,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6ddfd4991108..8694c7b6d2b1 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1419,7 +1419,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index cc39e170b4aa..487d375f5a06 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -553,7 +553,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt) struct sfb_sched_data *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 74ea863b8240..123a53af2506 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -725,7 +725,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, (unsigned long)sch); - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; -- cgit v1.2.3 From 855319becbcffec6988a4e781a861b69a71c5b58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:58 +0200 Subject: net: sched: store net pointer in block and introduce qdisc_net helper Store net pointer in the block structure. Along the way, introduce qdisc_net helper which allows to easily obtain net pointer for qdisc instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f7d3f1f539b7..856003caa3bb 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -257,6 +257,7 @@ int tcf_block_get(struct tcf_block **p_block, goto err_chain_create; } tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + block->net = qdisc_net(q); block->q = q; *p_block = block; return 0; -- cgit v1.2.3 From c1954561cd262b8adf7908c5552fe9ad99f82f81 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:01 +0200 Subject: net: sched: ematch: obtain net pointer from blocks Instead of using tp->q, use block to get the net pointer. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/ematch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 03b677bc0700..1331a4c2d8ff 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,7 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); - struct net *net = dev_net(qdisc_dev(tp->q)); + struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; -- cgit v1.2.3 From 7fa9d974f3c2a016b9accb18f4ee2ed2a738585c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:02 +0200 Subject: net: sched: cls_u32: use block instead of q in tc_u_common tc_u_common is now per-q. With blocks, it has to be converted to be per-block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 094d224411a9..b6d46065f661 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -93,7 +93,7 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; - struct Qdisc *q; + struct tcf_block *block; int refcnt; struct idr handle_idr; struct hlist_node hnode; @@ -335,11 +335,7 @@ static struct hlist_head *tc_u_common_hash; static unsigned int tc_u_hash(const struct tcf_proto *tp) { - struct net_device *dev = tp->q->dev_queue->dev; - u32 qhandle = tp->q->handle; - int ifindex = dev->ifindex; - - return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT); + return hash_64((u64) tp->chain->block, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) @@ -349,7 +345,7 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) h = tc_u_hash(tp); hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->q == tp->q) + if (tc->block == tp->chain->block) return tc; } return NULL; @@ -378,7 +374,7 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->q = tp->q; + tp_c->block = tp->chain->block; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); -- cgit v1.2.3 From 1abf272022cf1d18469405f47b4ec49c6a3125db Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:03 +0200 Subject: net: sched: tcindex, fw, flow: use tcf_block_q helper to get struct Qdisc Use helper to get q pointer per block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 7 +++++-- net/sched/cls_fw.c | 5 ++++- net/sched/cls_tcindex.c | 5 ++++- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 2a3a60ec5b86..f3be6667a253 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -491,8 +491,11 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } - if (TC_H_MAJ(baseclass) == 0) - baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MAJ(baseclass) == 0) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + + baseclass = TC_H_MAKE(q->handle, baseclass); + } if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 941245ad07fd..aa1e1f3bd20c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -28,6 +28,7 @@ #include #include #include +#include #define HTSIZE 256 @@ -83,9 +84,11 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { + struct Qdisc *q = tcf_block_q(tp->chain->block); + /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || - !(TC_H_MAJ(id ^ tp->q->handle)))) { + !(TC_H_MAJ(id ^ q->handle)))) { res->classid = id; res->class = 0; return 0; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 14a7e08b2fa9..d732b5474a4d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Passing parameters to the root seems to be done more awkwardly than really @@ -90,9 +91,11 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = tcindex_lookup(p, key); if (!f) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + if (!p->fall_through) return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); res->class = 0; pr_debug("alg 0x%x\n", res->classid); return 0; -- cgit v1.2.3 From 74e3be6021d22df2ffcb691eae1affeb2bd0128e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:04 +0200 Subject: net: sched: use tcf_block_q helper to get q pointer for sch_tree_lock Use tcf_block_q helper to get q pointer to be used for direct call of sch_tree_lock/unlock instead of tcf_tree_lock/unlock. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index aa82116ed10c..a9ac912f1d67 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1661,9 +1661,11 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_bind_args *a = (void *)arg; if (tp->ops->bind_class) { - tcf_tree_lock(tp); + struct Qdisc *q = tcf_block_q(tp->chain->block); + + sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl); - tcf_tree_unlock(tp); + sch_tree_unlock(q); } return 0; } -- cgit v1.2.3 From a10fa20101ae48fed2102f8e84d98f5aac2e5c40 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:05 +0200 Subject: net: sched: propagate q and parent from caller down to tcf_fill_node The callers have this info, they will pass it down to tcf_fill_node. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 55 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 856003caa3bb..2e8e87fd9d97 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -420,8 +420,8 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, void *fh, u32 portid, - u32 seq, u16 flags, int event) + struct tcf_proto *tp, struct Qdisc *q, u32 parent, + void *fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -434,8 +434,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; - tcm->tcm_parent = tp->classid; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = parent; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; @@ -458,6 +458,7 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, int event, bool unicast) { struct sk_buff *skb; @@ -467,7 +468,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; @@ -482,6 +483,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, bool unicast, bool *last) { struct sk_buff *skb; @@ -492,7 +494,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { kfree_skb(skb); return -EINVAL; @@ -512,6 +514,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, + struct Qdisc *q, u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event) { @@ -519,7 +522,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) - tfilter_notify(net, oskb, n, tp, 0, event, false); + tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false); } /* Add/change/delete/get a filter node */ @@ -638,7 +641,8 @@ replay: } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); + tfilter_notify_chain(net, skb, q, parent, n, + chain, RTM_DELTFILTER); tcf_chain_flush(chain); err = 0; goto errout; @@ -685,7 +689,7 @@ replay: if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, + tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); err = 0; @@ -710,8 +714,8 @@ replay: } break; case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, fh, false, - &last); + err = tfilter_del_notify(net, skb, n, tp, q, parent, + fh, false, &last); if (err) goto errout; if (last) { @@ -720,7 +724,7 @@ replay: } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, + err = tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_NEWTFILTER, true); goto errout; default: @@ -734,7 +738,8 @@ replay: if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); + tfilter_notify(net, skb, n, tp, q, parent, fh, + RTM_NEWTFILTER, false); } else { if (tp_created) tcf_proto_destroy(tp); @@ -753,6 +758,8 @@ struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; + struct Qdisc *q; + u32 parent; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -760,13 +767,14 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, + return tcf_fill_node(net, a->skb, tp, a->q, a->parent, + n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } -static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, - struct netlink_callback *cb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, + struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { struct net *net = sock_net(skb->sk); @@ -788,7 +796,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, + if (tcf_fill_node(net, skb, tp, q, parent, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -801,6 +809,8 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; + arg.q = q; + arg.parent = parent; arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; @@ -826,6 +836,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) @@ -839,10 +850,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) return skb->len; - if (!tcm->tcm_parent) + parent = tcm->tcm_parent; + if (!parent) { q = dev->qdisc; - else + parent = q->handle; + } else { q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } if (!q) goto out; cops = q->ops->cl_ops; @@ -866,7 +880,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; - if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + if (!tcf_chain_dump(chain, q, parent, skb, cb, + index_start, &index)) break; } -- cgit v1.2.3 From 0e80193bd8c1d97654f1a2f45934e7372e9a512e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:01:08 -0700 Subject: ipv6: check fn before doing FIB6_SUBTREE(fn) In fib6_locate(), we need to first make sure fn is not NULL before doing FIB6_SUBTREE(fn) to avoid crash. This fixes the following static checker warning: net/ipv6/ip6_fib.c:1462 fib6_locate() warn: variable dereferenced before check 'fn' (see line 1459) net/ipv6/ip6_fib.c 1458 if (src_len) { 1459 struct fib6_node *subtree = FIB6_SUBTREE(fn); ^^^^^^^^^^^^^^^^ We shifted this dereference 1460 1461 WARN_ON(saddr == NULL); 1462 if (fn && subtree) ^^ before the check for NULL. 1463 fn = fib6_locate_1(subtree, saddr, src_len, 1464 offsetof(struct rt6_info, rt6i_src) Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Dan Carpenter Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c2ecd5ec638a..548af48212fc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1456,13 +1456,16 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { - struct fib6_node *subtree = FIB6_SUBTREE(fn); - WARN_ON(saddr == NULL); - if (fn && subtree) - fn = fib6_locate_1(subtree, saddr, src_len, + if (fn) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree) { + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); + } + } } #endif -- cgit v1.2.3 From 0da4af00b2ed3dbe46788623a696c4169447eadc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:08:07 -0700 Subject: ipv6: only update __use and lastusetime once per jiffy at most In order to not dirty the cacheline too often, we try to only update dst->__use and dst->lastusetime at most once per jiffy. As dst->lastusetime is only used by ipv6 garbage collector, it should be good enough time resolution. And __use is only used in ipv6_route_seq_show() to show how many times a dst has been used. And as __use is not atomic_t right now, it does not show the precise number of usage times anyway. So we think it should be OK to only update it at most once per jiffy. According to my latest syn flood test on a machine with intel Xeon 6th gen processor and 2 10G mlx nics bonded together, each with 8 rx queues on 2 NUMA nodes: With this patch, the packet process rate increases from ~3.49Mpps to ~3.75Mpps with a 7% increase rate. Note: dst_use() is being renamed to dst_hold_and_use() to better specify the purpose of the function. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/decnet/dn_route.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0bd3afd01dd2..bff5ab88cdbb 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -338,7 +338,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - dst_use(&rth->dst, now); + dst_hold_and_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); dst_release_immediate(&rt->dst); @@ -351,7 +351,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - dst_use(&rt->dst, now); + dst_hold_and_use(&rt->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); *rp = rt; return 0; @@ -1258,7 +1258,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn * (flp->flowidn_mark == rt->fld.flowidn_mark) && dn_is_output_route(rt) && (rt->fld.flowidn_oif == flp->flowidn_oif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock_bh(); *pprt = &rt->dst; return 0; @@ -1535,7 +1535,7 @@ static int dn_route_input(struct sk_buff *skb) (rt->fld.flowidn_oif == 0) && (rt->fld.flowidn_mark == skb->mark) && (rt->fld.flowidn_iif == cb->iif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock(); skb_dst_set(skb, (struct dst_entry *)rt); return 0; -- cgit v1.2.3 From d01174fcd2c1ffefdd0554f847c4045a5c731591 Mon Sep 17 00:00:00 2001 From: Xue Liu Date: Tue, 10 Oct 2017 17:17:45 +0200 Subject: ieee802154: netlink: fix typo of the name of struct genl_ops Signed-off-by: Xue Liu Signed-off-by: Stefan Schmidt --- net/ieee802154/netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 6bde9e5a5503..96636e3b7aa9 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -89,7 +89,7 @@ int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) return genlmsg_reply(msg, info); } -static const struct genl_ops ieee8021154_ops[] = { +static const struct genl_ops ieee802154_ops[] = { /* see nl-phy.c */ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy, ieee802154_dump_phy), @@ -137,8 +137,8 @@ struct genl_family nl802154_family __ro_after_init = { .version = 1, .maxattr = IEEE802154_ATTR_MAX, .module = THIS_MODULE, - .ops = ieee8021154_ops, - .n_ops = ARRAY_SIZE(ieee8021154_ops), + .ops = ieee802154_ops, + .n_ops = ARRAY_SIZE(ieee802154_ops), .mcgrps = ieee802154_mcgrps, .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), }; -- cgit v1.2.3 From bc28df6e8543d41abfb48d4a4c6d445638d6768c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 15 Oct 2017 13:22:10 -0500 Subject: net: dccp: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that for options.c file, I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/dccp/input.c | 1 + net/dccp/options.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index fa6be9750bb4..d28d46bff6ab 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -534,6 +534,7 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, case DCCP_PKT_DATA: if (sk->sk_state == DCCP_RESPOND) break; + /* fall through */ case DCCP_PKT_DATAACK: case DCCP_PKT_ACK: /* diff --git a/net/dccp/options.c b/net/dccp/options.c index 51cdfc3bd8ca..4e40db017e19 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -227,8 +227,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, * Ack vectors are processed by the TX CCID if it is * interested. The RX CCID need not parse Ack Vectors, * since it is only interested in clearing old state. - * Fall through. */ + /* fall through */ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) -- cgit v1.2.3 From d85969f1a981b9cd57f5037ebcb9c6d385c0bc70 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 Oct 2017 14:33:21 +0100 Subject: tcp: cdg: make struct tcp_cdg static The structure tcp_cdg is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'tcp_cdg' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv4/tcp_cdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 66ac69f7bd19..06fbe102a425 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk) kfree(ca->gradients); } -struct tcp_congestion_ops tcp_cdg __read_mostly = { +static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, -- cgit v1.2.3 From 070cbf5be7774dcf0ceca081c7321c6f2ae833a4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Oct 2017 15:44:35 +0200 Subject: rtnetlink: place link af dump into own helper next patch will rcu-ify rtnl af_ops, i.e. allow af_ops lookup and function calls with rcu read lock held instead of rtnl mutex. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 72 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6a09f3d575af..a49cad25e577 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1382,6 +1382,47 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, return 0; } +static int rtnl_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, + u32 ext_filter_mask) +{ + const struct rtnl_af_ops *af_ops; + struct nlattr *af_spec; + + af_spec = nla_nest_start(skb, IFLA_AF_SPEC); + if (!af_spec) + return -EMSGSIZE; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + struct nlattr *af; + int err; + + if (!af_ops->fill_link_af) + continue; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + return -EMSGSIZE; + + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + return -EMSGSIZE; + + nla_nest_end(skb, af); + } + + nla_nest_end(skb, af_spec); + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, @@ -1389,8 +1430,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct nlattr *af_spec; - struct rtnl_af_ops *af_ops; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1477,36 +1516,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; - if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { - if (af_ops->fill_link_af) { - struct nlattr *af; - int err; - - if (!(af = nla_nest_start(skb, af_ops->family))) - goto nla_put_failure; - - err = af_ops->fill_link_af(skb, dev, ext_filter_mask); - - /* - * Caller may return ENODATA to indicate that there - * was no data to be dumped. This is not an error, it - * means we should trim the attribute header and - * continue. - */ - if (err == -ENODATA) - nla_nest_cancel(skb, af); - else if (err < 0) - goto nla_put_failure; - - nla_nest_end(skb, af); - } - } - - nla_nest_end(skb, af_spec); - nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 5fa85a09390c4a525cb4d06a0c4644b01a47976b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Oct 2017 15:44:36 +0200 Subject: net: core: rcu-ify rtnl af_ops rtnl af_ops currently rely on rtnl mutex: unregister (called from module exit functions) takes the rtnl mutex and all users that do af_ops lookup also take the rtnl mutex. IOW, parallel rmmod will block until doit() callback is done. As none of the af_ops implementation sleep we can use rcu instead. doit functions that need the af_ops can now use rcu instead of the rtnl mutex provided the mutex isn't needed for other reasons. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 62 ++++++++++++++++++++++++++++++++++++++-------------- net/ipv4/devinet.c | 4 ++-- 2 files changed, 48 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a49cad25e577..20b550d07fe3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -453,7 +453,7 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; - list_for_each_entry(ops, &rtnl_af_ops, list) { + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } @@ -470,7 +470,7 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) void rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); - list_add_tail(&ops->list, &rtnl_af_ops); + list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -482,8 +482,10 @@ EXPORT_SYMBOL_GPL(rtnl_af_register); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - list_del(&ops->list); + list_del_rcu(&ops->list); rtnl_unlock(); + + synchronize_rcu(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -496,13 +498,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } + rcu_read_unlock(); return size; } @@ -1393,7 +1397,7 @@ static int rtnl_fill_link_af(struct sk_buff *skb, if (!af_spec) return -EMSGSIZE; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; @@ -1516,12 +1520,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; + rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) - goto nla_put_failure; + goto nla_put_failure_rcu; + rcu_read_unlock(); nlmsg_end(skb, nlh); return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -1783,17 +1791,27 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + rcu_read_lock(); + af_ops = rtnl_af_lookup(nla_type(af)); + if (!af_ops) { + rcu_read_unlock(); return -EAFNOSUPPORT; + } - if (!af_ops->set_link_af) + if (!af_ops->set_link_af) { + rcu_read_unlock(); return -EOPNOTSUPP; + } if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); return err; + } } + + rcu_read_unlock(); } } @@ -2251,13 +2269,18 @@ static int do_setlink(const struct sk_buff *skb, nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; + rcu_read_lock(); + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) BUG(); err = af_ops->set_link_af(dev, af); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); goto errout; + } + rcu_read_unlock(); status |= DO_SETLINK_NOTIFY; } } @@ -4004,25 +4027,30 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (!attr) goto nla_put_failure; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; int err; af = nla_nest_start(skb, af_ops->family); - if (!af) + if (!af) { + rcu_read_unlock(); goto nla_put_failure; - + } err = af_ops->fill_stats_af(skb, dev); - if (err == -ENODATA) + if (err == -ENODATA) { nla_nest_cancel(skb, af); - else if (err < 0) + } else if (err < 0) { + rcu_read_unlock(); goto nla_put_failure; + } nla_nest_end(skb, af); } } + rcu_read_unlock(); nla_nest_end(skb, attr); @@ -4091,7 +4119,8 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); @@ -4100,6 +4129,7 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, size += nla_total_size(0); } } + rcu_read_unlock(); } return size; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7ce22a2c07ce..6d9b072d903b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1757,7 +1757,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rtnl(dev)) + if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); @@ -1781,7 +1781,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; -- cgit v1.2.3 From 36c0a9dfc6613242ba1de012e2d15145cdaae805 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 16 Oct 2017 16:04:51 +0200 Subject: tipc: fix rebasing error In commit 2f487712b893 ("tipc: guarantee that group broadcast doesn't bypass group unicast") there was introduced a last-minute rebasing error that broke non-group communication. We fix this here. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 723dd6998684..870b9b8f877a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1052,6 +1052,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, return true; } case CONN_MANAGER: + skb_queue_tail(inputq, skb); return true; case GROUP_PROTOCOL: skb_queue_tail(mc_inputq, skb); -- cgit v1.2.3 From 6a4bc44b012cbc29c9d824be2c7ab9eac8ee6b6f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 16 Oct 2017 09:31:47 +0200 Subject: batman-adv: Avoid spurious warnings from bat_v neigh_cmp implementation The neighbor compare API implementation for B.A.T.M.A.N. V checks whether the neigh_ifinfo for this neighbor on a specific interface exists. A warning is printed when it isn't found. But it is not called inside a lock which would prevent that this information is lost right before batadv_neigh_ifinfo_get. It must therefore be expected that batadv_v_neigh_(cmp|is_sob) might not be able to get the requested neigh_ifinfo. A WARN_ON for such a situation seems not to be appropriate because this will only flood the kernel logs. The warnings must therefore be removed. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 93ef1c06227e..341ceab8338d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -19,7 +19,6 @@ #include "main.h" #include -#include #include #include #include @@ -623,11 +622,11 @@ static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); - if (WARN_ON(!ifinfo1)) + if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); - if (WARN_ON(!ifinfo2)) + if (!ifinfo2) goto err_ifinfo2; ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; @@ -649,11 +648,11 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); - if (WARN_ON(!ifinfo1)) + if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); - if (WARN_ON(!ifinfo2)) + if (!ifinfo2) goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; -- cgit v1.2.3 From 8a5f2166a6288ee4b5a393f1ebc8cfb26b0510f0 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Tue, 17 Oct 2017 12:10:10 +0200 Subject: net: export netdev_txq_to_tc to allow sch_mqprio to compile as module In commit 32302902ff09 ("mqprio: Reserve last 32 classid values for HW traffic classes and misc IDs") sch_mqprio started using netdev_txq_to_tc to find the correct tc instead of dev->tc_to_txq[] However, when mqprio is compiled as a module, it cannot resolve the symbol, leading to this error: ERROR: "netdev_txq_to_tc" [net/sched/sch_mqprio.ko] undefined! This adds an EXPORT_SYMBOL() since the other user in the kernel (netif_set_xps_queue) is also EXPORT_SYMBOL() (and not _GPL) or in a sysfs-callback. Cc: Alexander Duyck Cc: Jesus Sanchez-Palencia Cc: David S. Miller Signed-off-by: Henrik Austad Reviewed-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fcddccb6be41..d2b20e73080e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2040,6 +2040,7 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) return 0; } +EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static DEFINE_MUTEX(xps_map_mutex); -- cgit v1.2.3 From 5a6cd6de76ae78b651e7c36eba8b1da465d65f06 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Thu, 5 Oct 2017 14:53:40 -0700 Subject: ethtool: add ethtool_intersect_link_masks This function provides a way to intersect two link masks together to find the common ground between them. For example in i40e, the driver first generates link masks for what is supported by the PHY type. The driver then gets the link masks for what the NVM supports. The resulting intersection between them yields what can truly be supported. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/core/ethtool.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3228411ada0f..0c406306792a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -403,6 +403,22 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } +/* Given two link masks, AND them together and save the result in dst. */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +EXPORT_SYMBOL(ethtool_intersect_link_masks); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { -- cgit v1.2.3 From a68f4a27f55f1d54e35c270aff89383da4b1b656 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:36:39 +0100 Subject: rxrpc: Support service upgrade from a kernel service Provide support for a kernel service to make use of the service upgrade facility. This involves: (1) Pass an upgrade request flag to rxrpc_kernel_begin_call(). (2) Make rxrpc_kernel_recv_data() return the call's current service ID so that the caller can detect service upgrade and see what the service was upgraded to. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 5 ++++- net/rxrpc/recvmsg.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fb17552fd292..481f7dc90ba2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,6 +265,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @tx_total_len: Total length of data to transmit during the call (or -1) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue + * @upgrade: Request service upgrade for call * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and @@ -279,7 +280,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, unsigned long user_call_ID, s64 tx_total_len, gfp_t gfp, - rxrpc_notify_rx_t notify_rx) + rxrpc_notify_rx_t notify_rx, + bool upgrade) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -304,6 +306,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.key = key; cp.security_level = 0; cp.exclusive = false; + cp.upgrade = upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len, gfp); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index bdece21f313d..e4937b3f3685 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -607,6 +607,7 @@ wait_error: * @_offset: The running offset into the buffer. * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned + * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the * state of a call. Returns 0 if got what was asked for and there's more @@ -624,7 +625,7 @@ wait_error: */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, void *buf, size_t size, size_t *_offset, - bool want_more, u32 *_abort) + bool want_more, u32 *_abort, u16 *_service) { struct iov_iter iter; struct kvec iov; @@ -680,6 +681,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: + if (_service) + *_service = call->service_id; mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; -- cgit v1.2.3 From f4d15fb6f99af9b99f688bd87579137be44f85ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:07:31 +0100 Subject: rxrpc: Provide functions for allowing cleaner handling of signals Provide a couple of functions to allow cleaner handling of signals in a kernel service. They are: (1) rxrpc_kernel_get_rtt() This allows the kernel service to find out the RTT time for a call, so as to better judge how large a timeout to employ. Note, though, that whilst this returns a value in nanoseconds, the timeouts can only actually be in jiffies. (2) rxrpc_kernel_check_life() This returns a number that is updated when ACKs are received from the peer (notably including PING RESPONSE ACKs which we can elicit by sending PING ACKs to see if the call still exists on the server). The caller should compare the numbers of two calls to see if the call is still alive. These can be used to provide an extending timeout rather than returning immediately in the case that a signal occurs that would otherwise abort an RPC operation. The timeout would be extended if the server is still responsive and the call is still apparently alive on the server. For most operations this isn't that necessary - but for FS.StoreData it is: OpenAFS writes the data to storage as it comes in without making a backup, so if we immediately abort it when partially complete on a CTRL+C, say, we have no idea of the state of the file after the abort. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 19 +++++++++++++++++++ net/rxrpc/peer_object.c | 13 +++++++++++++ 2 files changed, 32 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 481f7dc90ba2..73c980e26581 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -339,6 +339,25 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_end_call); +/** + * rxrpc_kernel_check_life - Check to see whether a call is still alive + * @sock: The socket the call is on + * @call: The call to check + * + * Allow a kernel service to find out whether a call is still alive - ie. we're + * getting ACKs from the server. Returns a number representing the life state + * which can be compared to that returned by a previous call. + * + * If this is a client call, ping ACKs will be sent to the server to find out + * whether it's still responsive and whether the call is still alive on the + * server. + */ +u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +{ + return call->acks_latest; +} +EXPORT_SYMBOL(rxrpc_kernel_check_life); + /** * rxrpc_kernel_check_call - Check a call's state * @sock: The socket the call is on diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 5787f97f5330..d02a99f37f5f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -411,3 +411,16 @@ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, *_srx = call->peer->srx; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); + +/** + * rxrpc_kernel_get_rtt - Get a call's peer RTT + * @sock: The socket on which the call is in progress. + * @call: The call to query + * + * Get the call's peer RTT. + */ +u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) +{ + return call->peer->rtt; +} +EXPORT_SYMBOL(rxrpc_kernel_get_rtt); -- cgit v1.2.3 From bc5e3a546d553e5223851fc199e69040eb70f68b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:07:31 +0100 Subject: rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals Make AF_RXRPC accept MSG_WAITALL as a flag to sendmsg() to tell it to ignore signals whilst loading up the message queue, provided progress is being made in emptying the queue at the other side. Progress is defined as the base of the transmit window having being advanced within 2 RTT periods. If the period is exceeded with no progress, sendmsg() will return anyway, indicating how much data has been copied, if any. Once the supplied buffer is entirely decanted, the sendmsg() will return. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 107 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9ea6f972767e..2d9edc656ca3 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -37,13 +37,87 @@ struct rxrpc_send_params { bool upgrade; /* If the connection is upgradeable */ }; +/* + * Wait for space to appear in the Tx queue or a signal to occur. + */ +static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) + return 0; + + if (call->state >= RXRPC_CALL_COMPLETE) + return call->error; + + if (signal_pending(current)) + return sock_intr_errno(*timeo); + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + mutex_unlock(&call->user_mutex); + *timeo = schedule_timeout(*timeo); + if (mutex_lock_interruptible(&call->user_mutex) < 0) + return sock_intr_errno(*timeo); + } +} + +/* + * Wait for space to appear in the Tx queue uninterruptibly, but with + * a timeout of 2*RTT if no progress was made and a signal occurred. + */ +static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, + struct rxrpc_call *call) +{ + rxrpc_seq_t tx_start, tx_win; + signed long rtt2, timeout; + u64 rtt; + + rtt = READ_ONCE(call->peer->rtt); + rtt2 = nsecs_to_jiffies64(rtt) * 2; + if (rtt2 < 1) + rtt2 = 1; + + timeout = rtt2; + tx_start = READ_ONCE(call->tx_hard_ack); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + + tx_win = READ_ONCE(call->tx_hard_ack); + if (call->tx_top - tx_win < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) + return 0; + + if (call->state >= RXRPC_CALL_COMPLETE) + return call->error; + + if (timeout == 0 && + tx_win == tx_start && signal_pending(current)) + return -EINTR; + + if (tx_win != tx_start) { + timeout = rtt2; + tx_start = tx_win; + } + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + timeout = schedule_timeout(timeout); + } +} + /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, struct rxrpc_call *call, - long *timeo) + long *timeo, + bool waitall) { DECLARE_WAITQUEUE(myself, current); int ret; @@ -53,30 +127,10 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, add_wait_queue(&call->waitq, &myself); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - ret = 0; - if (call->tx_top - call->tx_hard_ack < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) - break; - if (call->state >= RXRPC_CALL_COMPLETE) { - ret = call->error; - break; - } - if (signal_pending(current)) { - ret = sock_intr_errno(*timeo); - break; - } - - trace_rxrpc_transmit(call, rxrpc_transmit_wait); - mutex_unlock(&call->user_mutex); - *timeo = schedule_timeout(*timeo); - if (mutex_lock_interruptible(&call->user_mutex) < 0) { - ret = sock_intr_errno(*timeo); - break; - } - } + if (waitall) + ret = rxrpc_wait_for_tx_window_nonintr(rx, call); + else + ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); @@ -254,7 +308,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; ret = rxrpc_wait_for_tx_window(rx, call, - &timeo); + &timeo, + msg->msg_flags & MSG_WAITALL); if (ret < 0) goto maybe_error; } -- cgit v1.2.3 From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. If driver page recycle cache is not efficient then this, exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind is being so efficient to transfer memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as implementation require a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a more simple void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/filter.c | 140 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 111 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 140fa9f9c0f4..4d88e0665c41 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); if (err) return err; - if (map) + dev->netdev_ops->ndo_xdp_flush(dev); + return 0; +} + +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + struct net_device *dev = fwd; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; __dev_map_insert_ctx(map, index); - else - dev->netdev_ops->ndo_xdp_flush(dev); + + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + struct bpf_cpu_map_entry *rcpu = fwd; + + err = cpu_map_enqueue(rcpu, xdp, dev_rx); + if (err) + return err; + __cpu_map_insert_ctx(map, index); + } return 0; } @@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void) struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; - if (map) - __dev_map_flush(map); + if (map) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + __dev_map_flush(map); + break; + case BPF_MAP_TYPE_CPUMAP: + __cpu_map_flush(map); + break; + default: + break; + } + } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_CPUMAP: + return __cpu_map_lookup_elem(map, index); + default: + return NULL; + } +} + static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, unsigned long aux) { @@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err; ri->ifindex = 0; @@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - fwd = __dev_map_lookup_elem(map, index); + fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); - err = __bpf_tx_xdp(fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; @@ -2620,54 +2668,88 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + +int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; - unsigned int len; int err = 0; ri->ifindex = 0; ri->map = NULL; ri->map_owner = 0; - if (map) { - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } - fwd = __dev_map_lookup_elem(map, index); - } else { - fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; } + fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (unlikely(!(fwd->flags & IFF_UP))) { - err = -ENETDOWN; + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; + } else { + /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ + err = -EBADRQC; goto err; } - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) { - err = -EMSGSIZE; + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + u32 index = ri->ifindex; + struct net_device *fwd; + int err = 0; + + if (ri->map) + return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + + ri->ifindex = 0; + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; goto err; } + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; - map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) - : _trace_xdp_redirect(dev, xdp_prog, index); + _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) - : _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff in converted into a struct xdp_pkt, and it mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried between enqueue to dequeue. If a driver doesn't have enough headroom it is simply dropped, with return code -EOVERFLOW. This will be picked up the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d2b20e73080e..cf5894f0e6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4492,6 +4492,33 @@ out: return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if (page_is_)pfmemalloc. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + static int __netif_receive_skb(struct sk_buff *skb) { int ret; -- cgit v1.2.3 From a2084f5650624edd0805dc78260d097df4f38eb6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 16 Oct 2017 16:57:49 +0200 Subject: netlink: use NETLINK_CB(in_skb).sk instead of looking it up When netlink_ack() reports an allocation error to the sending socket, there's no need to look up the sending socket since it's available in the SKB's CB. Use that instead of going to the trouble of looking it up. Note that the pointer is only available since Eric Biederman's commit 3fbc290540a1 ("netlink: Make the sending netlink socket availabe in NETLINK_CB") which is far newer than the original lookup code (Oct 2003) (though the field was called 'ssk' in that commit and only got renamed to 'sk' later, I'd actually argue 'ssk' was better - or perhaps it should've been 'source_sk' - since there are so many different 'sk's involved.) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f34750691c5c..336d9c6dcad9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2336,16 +2336,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { - struct sock *sk; - - sk = netlink_lookup(sock_net(in_skb->sk), - in_skb->sk->sk_protocol, - NETLINK_CB(in_skb).portid); - if (sk) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - sock_put(sk); - } + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk); return; } -- cgit v1.2.3 From a5b930e0598dc46184af0282c23c75b2167d9384 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:13 -0400 Subject: net: dsa: use port's cpu_dp when creating a slave When dsa_slave_create is called, the related port already has a CPU port assigned to it, available in its cpu_dp member. Use it instead of the unique tree cpu_dp. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 45f4ea845c07..c6f4829645bf 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,16 +1117,13 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_notifier_register_info rinfo = { }; + struct dsa_port *cpu_dp = port->cpu_dp; + struct net_device *master = cpu_dp->netdev; struct dsa_switch *ds = port->ds; - struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; - struct dsa_port *cpu_dp; int ret; - cpu_dp = ds->dst->cpu_dp; - master = cpu_dp->netdev; - if (!ds->num_tx_queues) ds->num_tx_queues = 1; -- cgit v1.2.3 From 6158eaa7a717b469b1b0c0ae6d79910737686279 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:14 -0400 Subject: net: dsa: add slave notify helper Both DSA slave create and destroy functions call call_dsa_notifiers with respectively DSA_PORT_REGISTER and DSA_PORT_UNREGISTER and the same dsa_notifier_register_info structure. Wrap this in a dsa_slave_notify helper so prevent cluttering these functions. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c6f4829645bf..f31737256f69 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1114,9 +1114,23 @@ int dsa_slave_resume(struct net_device *slave_dev) return 0; } +static void dsa_slave_notify(struct net_device *dev, unsigned long val) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = dsa_master_netdev(p); + struct dsa_port *dp = p->dp; + struct dsa_notifier_register_info rinfo = { + .switch_number = dp->ds->index, + .port_number = dp->index, + .master = master, + .info.dev = dev, + }; + + call_dsa_notifiers(val, dev, &rinfo.info); +} + int dsa_slave_create(struct dsa_port *port, const char *name) { - struct dsa_notifier_register_info rinfo = { }; struct dsa_port *cpu_dp = port->cpu_dp; struct net_device *master = cpu_dp->netdev; struct dsa_switch *ds = port->ds; @@ -1175,11 +1189,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) goto out_free; } - rinfo.info.dev = slave_dev; - rinfo.master = master; - rinfo.port_number = p->dp->index; - rinfo.switch_number = p->dp->ds->index; - call_dsa_notifiers(DSA_PORT_REGISTER, slave_dev, &rinfo.info); + dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); ret = register_netdev(slave_dev); if (ret) { @@ -1204,7 +1214,6 @@ out_free: void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_notifier_register_info rinfo = { }; struct device_node *port_dn; port_dn = p->dp->dn; @@ -1216,11 +1225,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } - rinfo.info.dev = slave_dev; - rinfo.master = p->dp->cpu_dp->netdev; - rinfo.port_number = p->dp->index; - rinfo.switch_number = p->dp->ds->index; - call_dsa_notifiers(DSA_PORT_UNREGISTER, slave_dev, &rinfo.info); + dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); free_percpu(p->stats64); free_netdev(slave_dev); -- cgit v1.2.3 From d945097bb19501bd95790d8220d4eeb418b6ebb2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:15 -0400 Subject: net: dsa: add slave to port helper Many portions of DSA core code require to get the dsa_port structure corresponding to a slave net_device. For this purpose, introduce a dsa_slave_to_port() helper. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 7 +++ net/dsa/legacy.c | 6 +- net/dsa/slave.c | 167 +++++++++++++++++++++++++------------------------- net/dsa/tag_brcm.c | 9 ++- net/dsa/tag_dsa.c | 10 +-- net/dsa/tag_edsa.c | 10 +-- net/dsa/tag_ksz.c | 4 +- net/dsa/tag_lan9303.c | 4 +- net/dsa/tag_mtk.c | 4 +- net/dsa/tag_qca.c | 5 +- net/dsa/tag_trailer.c | 4 +- 11 files changed, 115 insertions(+), 115 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2850077cc9cc..569a4929b4c9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -169,6 +169,13 @@ int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + return p->dp; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index b0fefbffe082..cc28c6f792a3 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -740,8 +740,7 @@ int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid, u16 flags) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); return dsa_port_fdb_add(dp, addr, vid); } @@ -750,8 +749,7 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); return dsa_port_fdb_del(dp, addr, vid); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f31737256f69..894602c88b09 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -72,8 +72,8 @@ static int dsa_slave_get_iflink(const struct net_device *dev) static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; struct net_device *master = dsa_master_netdev(p); + struct dsa_port *dp = dsa_slave_to_port(dev); int err; if (!(master->flags & IFF_UP)) @@ -122,7 +122,7 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = dsa_master_netdev(p); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); if (dev->phydev) phy_stop(dev->phydev); @@ -246,14 +246,13 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; int err; err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump); @@ -274,8 +273,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int ret; switch (attr->id) { @@ -301,8 +299,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err; /* For the prepare phase, ensure the full set of changes is feasable in @@ -329,8 +326,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, static int dsa_slave_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err; switch (obj->id) { @@ -351,8 +347,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: @@ -431,11 +427,11 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, static int dsa_slave_get_regs_len(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) - return ds->ops->get_regs_len(ds, p->dp->index); + return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } @@ -443,11 +439,11 @@ static int dsa_slave_get_regs_len(struct net_device *dev) static void dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) - ds->ops->get_regs(ds, p->dp->index, regs, _p); + ds->ops->get_regs(ds, dp->index, regs, _p); } static u32 dsa_slave_get_link(struct net_device *dev) @@ -462,8 +458,8 @@ static u32 dsa_slave_get_link(struct net_device *dev) static int dsa_slave_get_eeprom_len(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; @@ -477,8 +473,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) static int dsa_slave_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); @@ -489,8 +485,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev, static int dsa_slave_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); @@ -501,8 +497,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev, static void dsa_slave_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; @@ -512,7 +508,7 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) - ds->ops->get_strings(ds, p->dp->index, data + 4 * len); + ds->ops->get_strings(ds, dp->index, data + 4 * len); } } @@ -520,8 +516,9 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; @@ -543,13 +540,13 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4); + ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count; @@ -566,29 +563,29 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_wol) - ds->ops->get_wol(ds, p->dp->index, w); + ds->ops->get_wol(ds, dp->index, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; if (ds->ops->set_wol) - ret = ds->ops->set_wol(ds, p->dp->index, w); + ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ @@ -598,7 +595,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->set_mac_eee(ds, p->dp->index, e); + ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; @@ -613,8 +610,8 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ @@ -624,7 +621,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (!ds->ops->get_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->get_mac_eee(ds, p->dp->index, e); + ret = ds->ops->get_mac_eee(ds, dp->index, e); if (ret) return ret; @@ -676,9 +673,9 @@ static void dsa_slave_poll_controller(struct net_device *dev) static int dsa_slave_get_phys_port_name(struct net_device *dev, char *name, size_t len) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); - if (snprintf(name, len, "p%d", p->dp->index) >= len) + if (snprintf(name, len, "p%d", dp->index) >= len) return -EINVAL; return 0; @@ -701,14 +698,15 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; __be16 protocol = cls->common.protocol; - struct dsa_switch *ds = p->dp->ds; struct net *net = dev_net(dev); - struct dsa_slave_priv *to_p; + struct dsa_switch *ds = dp->ds; struct net_device *to_dev; const struct tc_action *a; + struct dsa_port *to_dp; int err = -EOPNOTSUPP; LIST_HEAD(actions); int ifindex; @@ -741,13 +739,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; - to_p = netdev_priv(to_dev); + to_dp = dsa_slave_to_port(to_dev); - mirror->to_local_port = to_p->dp->index; + mirror->to_local_port = to_dp->index; mirror->ingress = ingress; - err = ds->ops->port_mirror_add(ds, p->dp->index, mirror, - ingress); + err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress); if (err) { kfree(mall_tc_entry); return err; @@ -762,9 +759,9 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, static void dsa_slave_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (!ds->ops->port_mirror_del) return; @@ -777,8 +774,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: - ds->ops->port_mirror_del(ds, p->dp->index, - &mall_tc_entry->mirror); + ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; default: WARN_ON(1); @@ -855,25 +851,25 @@ static void dsa_slave_get_stats64(struct net_device *dev, static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; - return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs); + return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_slave_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; - return ds->ops->set_rxnfc(ds, p->dp->index, nfc); + return ds->ops->set_rxnfc(ds, dp->index, nfc); } static const struct ethtool_ops dsa_slave_ethtool_ops = { @@ -933,8 +929,9 @@ static struct device_type dsa_type = { static void dsa_slave_adjust_link(struct net_device *dev) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; unsigned int status_changed = 0; if (p->old_link != dev->phydev->link) { @@ -953,7 +950,7 @@ static void dsa_slave_adjust_link(struct net_device *dev) } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->dp->index, dev->phydev); + ds->ops->adjust_link(ds, dp->index, dev->phydev); if (status_changed) phy_print_status(dev->phydev); @@ -962,14 +959,14 @@ static void dsa_slave_adjust_link(struct net_device *dev) static int dsa_slave_fixed_link_update(struct net_device *dev, struct fixed_phy_status *status) { - struct dsa_slave_priv *p; struct dsa_switch *ds; + struct dsa_port *dp; if (dev) { - p = netdev_priv(dev); - ds = p->dp->ds; + dp = dsa_slave_to_port(dev); + ds = dp->ds; if (ds->ops->fixed_link_update) - ds->ops->fixed_link_update(ds, p->dp->index, status); + ds->ops->fixed_link_update(ds, dp->index, status); } return 0; @@ -978,8 +975,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, /* slave device setup *******************************************************/ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); if (!slave_dev->phydev) { @@ -997,14 +995,15 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) static int dsa_slave_phy_setup(struct net_device *slave_dev) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_switch *ds = p->dp->ds; - struct device_node *phy_dn, *port_dn; + struct device_node *port_dn = dp->dn; + struct dsa_switch *ds = dp->ds; + struct device_node *phy_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = p->dp->dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1025,7 +1024,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) } if (ds->ops->get_phy_flags) - phy_flags = ds->ops->get_phy_flags(ds, p->dp->index); + phy_flags = ds->ops->get_phy_flags(ds, dp->index); if (phy_dn) { int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); @@ -1061,10 +1060,10 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) * MDIO bus instead */ if (!slave_dev->phydev) { - ret = dsa_slave_phy_connect(slave_dev, p->dp->index); + ret = dsa_slave_phy_connect(slave_dev, dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", - p->dp->index, ret); + dp->index, ret); if (phy_is_fixed) of_phy_deregister_fixed_link(port_dn); return ret; @@ -1118,7 +1117,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = dsa_master_netdev(p); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_notifier_register_info rinfo = { .switch_number = dp->ds->index, .port_number = dp->index, @@ -1202,8 +1201,8 @@ int dsa_slave_create(struct dsa_port *port, const char *name) out_phy: phy_disconnect(slave_dev->phydev); - if (of_phy_is_fixed_link(p->dp->dn)) - of_phy_deregister_fixed_link(p->dp->dn); + if (of_phy_is_fixed_link(port->dn)) + of_phy_deregister_fixed_link(port->dn); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1213,10 +1212,9 @@ out_free: void dsa_slave_destroy(struct net_device *slave_dev) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct device_node *port_dn; - - port_dn = p->dp->dn; + struct device_node *port_dn = dp->dn; netif_carrier_off(slave_dev); if (slave_dev->phydev) { @@ -1239,8 +1237,7 @@ static bool dsa_slave_dev_check(struct net_device *dev) static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err = NOTIFY_DONE; if (netif_is_bridge_master(info->upper_dev)) { @@ -1283,14 +1280,14 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) container_of(work, struct dsa_switchdev_event_work, work); struct net_device *dev = switchdev_work->dev; struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); int err; rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - err = dsa_port_fdb_add(p->dp, fdb_info->addr, fdb_info->vid); + err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb add failed err=%d\n", err); break; @@ -1301,7 +1298,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) case SWITCHDEV_FDB_DEL_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - err = dsa_port_fdb_del(p->dp, fdb_info->addr, fdb_info->vid); + err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb del failed err=%d\n", err); dev_close(dev); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index cc4f472fbd77..2a0fa9f9f7ca 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -61,7 +61,7 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 queue = skb_get_queue_mapping(skb); u8 *brcm_tag; @@ -82,15 +82,14 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev ((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT); brcm_tag[1] = 0; brcm_tag[2] = 0; - if (p->dp->index == 8) + if (dp->index == 8) brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; - brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; + brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK; /* Now tell the master network device about the desired output queue * as well */ - skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(p->dp->index, - queue)); + skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue)); return skb; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index c77218f173d1..fe3c9700a8c8 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -18,7 +18,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *dsa_header; /* @@ -34,8 +34,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct tagged FROM_CPU DSA tag from 802.1q tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60 | p->dp->ds->index; - dsa_header[1] = p->dp->index << 3; + dsa_header[0] = 0x60 | dp->ds->index; + dsa_header[1] = dp->index << 3; /* * Move CFI field from byte 2 to byte 1. @@ -55,8 +55,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct untagged FROM_CPU DSA tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40 | p->dp->ds->index; - dsa_header[1] = p->dp->index << 3; + dsa_header[0] = 0x40 | dp->ds->index; + dsa_header[1] = dp->index << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 0b83cbe0c9e8..c683e240bafc 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -19,7 +19,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *edsa_header; /* @@ -43,8 +43,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x60 | p->dp->ds->index; - edsa_header[5] = p->dp->index << 3; + edsa_header[4] = 0x60 | dp->ds->index; + edsa_header[5] = dp->index << 3; /* * Move CFI field from byte 6 to byte 5. @@ -68,8 +68,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x40 | p->dp->ds->index; - edsa_header[5] = p->dp->index << 3; + edsa_header[4] = 0x40 | dp->ds->index; + edsa_header[5] = dp->index << 3; edsa_header[6] = 0x00; edsa_header[7] = 0x00; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index b241c990cde0..66e443fd7675 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -34,7 +34,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; u8 *tag; @@ -72,7 +72,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); tag[0] = 0; - tag[1] = 1 << p->dp->index; /* destination port */ + tag[1] = 1 << dp->index; /* destination port */ return nskb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 4f211e56c81f..c709adb5efd9 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,7 +42,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 *lan9303_tag; /* insert a special VLAN tag between the MAC addresses @@ -62,7 +62,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); lan9303_tag[0] = htons(ETH_P_8021Q); - lan9303_tag[1] = htons(p->dp->index | BIT(4)); + lan9303_tag[1] = htons(dp->index | BIT(4)); return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 968586c5d40e..5abebbfd7060 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -23,7 +23,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *mtk_tag; if (skb_cow_head(skb, MTK_HDR_LEN) < 0) @@ -36,7 +36,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, /* Build the tag after the MAC Source Address */ mtk_tag = skb->data + 2 * ETH_ALEN; mtk_tag[0] = 0; - mtk_tag[1] = (1 << p->dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; mtk_tag[2] = 0; mtk_tag[3] = 0; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 8d33d9ebf910..b0274b6781cb 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -38,7 +38,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; dev->stats.tx_packets++; @@ -54,8 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | - QCA_HDR_XMIT_FROM_CPU | - BIT(p->dp->index); + QCA_HDR_XMIT_FROM_CPU | BIT(dp->index); *phdr = htons(hdr); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 61668be267f5..8b205bb5f521 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -16,7 +16,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; u8 *trailer; @@ -48,7 +48,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer = skb_put(nskb, 4); trailer[0] = 0x80; - trailer[1] = 1 << p->dp->index; + trailer[1] = 1 << dp->index; trailer[2] = 0x10; trailer[3] = 0x00; -- cgit v1.2.3 From d0006b002208936d36af8e4dce1f6dfeebb2dfba Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:16 -0400 Subject: net: dsa: add slave to master helper Many part of the DSA slave code require to get the master device assigned to a slave device. Remove dsa_master_netdev() in favor of a dsa_slave_to_master() helper which does that. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 13 ++++++++----- net/dsa/slave.c | 26 +++++++++----------------- 2 files changed, 17 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 569a4929b4c9..eedda1f1b099 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -176,6 +176,14 @@ static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) return p->dp; } +static inline struct net_device * +dsa_slave_to_master(const struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dp->cpu_dp->netdev; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); @@ -204,9 +212,4 @@ extern const struct dsa_device_ops qca_netdev_ops; /* tag_trailer.c */ extern const struct dsa_device_ops trailer_netdev_ops; -static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p) -{ - return p->dp->cpu_dp->netdev; -} - #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 894602c88b09..b72e07503a40 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -64,15 +64,12 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) /* slave device handling ****************************************************/ static int dsa_slave_get_iflink(const struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - return dsa_master_netdev(p)->ifindex; + return dsa_slave_to_master(dev)->ifindex; } static int dsa_slave_open(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); int err; @@ -120,8 +117,7 @@ out: static int dsa_slave_close(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); if (dev->phydev) @@ -144,8 +140,7 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -155,8 +150,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) static void dsa_slave_set_rx_mode(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); dev_mc_sync(master, dev); dev_uc_sync(master, dev); @@ -164,8 +158,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev) static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); struct sockaddr *addr = a; int err; @@ -409,7 +402,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ - nskb->dev = dsa_master_netdev(p); + nskb->dev = dsa_slave_to_master(dev); dev_queue_xmit(nskb); return NETDEV_TX_OK; @@ -632,8 +625,8 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); struct netpoll *netpoll; int err = 0; @@ -1115,8 +1108,7 @@ int dsa_slave_resume(struct net_device *slave_dev) static void dsa_slave_notify(struct net_device *dev, unsigned long val) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_notifier_register_info rinfo = { .switch_number = dp->ds->index, -- cgit v1.2.3 From 2231c43b560403675217f52204b18c1c59c0ee76 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:17 -0400 Subject: net: dsa: rename dsa_master_get_slave The dsa_master_get_slave is slightly confusing since the idiomatic "get" term often suggests reference counting, in symmetry to "put". Rename it to dsa_master_find_slave to make the look up operation clear. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++-- net/dsa/tag_brcm.c | 2 +- net/dsa/tag_dsa.c | 2 +- net/dsa/tag_edsa.c | 2 +- net/dsa/tag_ksz.c | 2 +- net/dsa/tag_lan9303.c | 2 +- net/dsa/tag_mtk.c | 2 +- net/dsa/tag_qca.c | 2 +- net/dsa/tag_trailer.c | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index eedda1f1b099..623c22b75e81 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -113,8 +113,8 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int dsa_master_ethtool_setup(struct net_device *dev); void dsa_master_ethtool_restore(struct net_device *dev); -static inline struct net_device *dsa_master_get_slave(struct net_device *dev, - int device, int port) +static inline struct net_device *dsa_master_find_slave(struct net_device *dev, + int device, int port) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2a0fa9f9f7ca..9e082bae3cb0 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -119,7 +119,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; - skb->dev = dsa_master_get_slave(dev, 0, source_port); + skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fe3c9700a8c8..dbbcdafed8c3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -91,7 +91,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - skb->dev = dsa_master_get_slave(dev, source_device, source_port); + skb->dev = dsa_master_find_slave(dev, source_device, source_port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index c683e240bafc..f38a626b3a05 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -104,7 +104,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; - skb->dev = dsa_master_get_slave(dev, source_device, source_port); + skb->dev = dsa_master_find_slave(dev, source_device, source_port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 66e443fd7675..0f62effad88f 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -87,7 +87,7 @@ static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, source_port = tag[0] & 7; - skb->dev = dsa_master_get_slave(dev, 0, source_port); + skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index c709adb5efd9..57519597c6fc 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -94,7 +94,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, source_port = ntohs(lan9303_tag[1]) & 0x3; - skb->dev = dsa_master_get_slave(dev, 0, source_port); + skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); return NULL; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 5abebbfd7060..8475434af7d5 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -69,7 +69,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); - skb->dev = dsa_master_get_slave(dev, 0, port); + skb->dev = dsa_master_find_slave(dev, 0, port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index b0274b6781cb..613f4ee97771 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -91,7 +91,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); - skb->dev = dsa_master_get_slave(dev, 0, port); + skb->dev = dsa_master_find_slave(dev, 0, port); if (!skb->dev) return NULL; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 8b205bb5f521..7d20e1f3de28 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -71,7 +71,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, source_port = trailer[1] & 7; - skb->dev = dsa_master_get_slave(dev, 0, source_port); + skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) return NULL; -- cgit v1.2.3 From f8b8b1cd5aadd221742b45eb0ee3c8a80abf036a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:18 -0400 Subject: net: dsa: split dsa_port's netdev member The dsa_port structure has a "netdev" member, which can be used for either the master device, or the slave device, depending on its type. It is true that today, CPU port are not exposed to userspace, thus the port's netdev member can be used to point to its master interface. But it is still slightly confusing, so split it into more explicit "master" and "slave" members inside an anonymous union. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 20 ++++++++++---------- net/dsa/dsa_priv.h | 4 ++-- net/dsa/legacy.c | 14 +++++++------- net/dsa/slave.c | 6 +++--- 5 files changed, 25 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 832c659ff993..a3abf7a7b9a2 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -201,7 +201,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; + return ds->enabled_port_mask & (1 << p) && ds->ports[p].slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -213,7 +213,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].netdev); + ret = dsa_slave_suspend(ds->ports[i].slave); if (ret) return ret; } @@ -240,7 +240,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].netdev); + ret = dsa_slave_resume(ds->ports[i].slave); if (ret) return ret; } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 6ac9e11d385c..9e8b8aab049d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -279,7 +279,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", port->index, err); - port->netdev = NULL; + port->slave = NULL; return err; } @@ -289,7 +289,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) return err; - devlink_port_type_eth_set(&port->devlink_port, port->netdev); + devlink_port_type_eth_set(&port->devlink_port, port->slave); return 0; } @@ -297,9 +297,9 @@ static int dsa_user_port_apply(struct dsa_port *port) static void dsa_user_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); - if (port->netdev) { - dsa_slave_destroy(port->netdev); - port->netdev = NULL; + if (port->slave) { + dsa_slave_destroy(port->slave); + port->slave = NULL; port->ds->enabled_port_mask &= ~(1 << port->index); } } @@ -432,9 +432,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. */ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; + dst->cpu_dp->master->dsa_ptr = dst->cpu_dp; - err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); + err = dsa_master_ethtool_setup(dst->cpu_dp->master); if (err) return err; @@ -451,9 +451,9 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -499,7 +499,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!dst->cpu_dp) { dst->cpu_dp = port; - dst->cpu_dp->netdev = ethernet_dev; + dst->cpu_dp->master = ethernet_dev; } /* Initialize cpu_port_mask now for drv->setup() diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 623c22b75e81..1e9914062d0b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -130,7 +130,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, if (port < 0 || port >= ds->num_ports) return NULL; - return ds->ports[port].netdev; + return ds->ports[port].slave; } /* port.c */ @@ -181,7 +181,7 @@ dsa_slave_to_master(const struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - return dp->cpu_dp->netdev; + return dp->cpu_dp->master; } /* switch.c */ diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index cc28c6f792a3..b6c88fd33d4f 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -120,7 +120,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return -EINVAL; } dst->cpu_dp = &ds->ports[i]; - dst->cpu_dp->netdev = master; + dst->cpu_dp->master = master; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; @@ -261,10 +261,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port].netdev) + if (!ds->ports[port].slave) continue; - dsa_slave_destroy(ds->ports[port].netdev); + dsa_slave_destroy(ds->ports[port].slave); } /* Disable configuration of the CPU and DSA ports */ @@ -601,7 +601,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst->cpu_dp; - return dsa_master_ethtool_setup(dst->cpu_dp->netdev); + return dsa_master_ethtool_setup(dst->cpu_dp->master); } static int dsa_probe(struct platform_device *pdev) @@ -666,9 +666,9 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -683,7 +683,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dev_put(dst->cpu_dp->netdev); + dev_put(dst->cpu_dp->master); } static int dsa_remove(struct platform_device *pdev) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b72e07503a40..6906de0f0050 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1123,7 +1123,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_port *cpu_dp = port->cpu_dp; - struct net_device *master = cpu_dp->netdev; + struct net_device *master = cpu_dp->master; struct dsa_switch *ds = port->ds; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1170,7 +1170,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_link = -1; p->old_duplex = -1; - port->netdev = slave_dev; + port->slave = slave_dev; netif_carrier_off(slave_dev); @@ -1198,7 +1198,7 @@ out_phy: out_free: free_percpu(p->stats64); free_netdev(slave_dev); - port->netdev = NULL; + port->slave = NULL; return ret; } -- cgit v1.2.3 From eb4ddaf474285a4c6986f4a1c3205bdb0bed2da9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:45 -0700 Subject: net/decnet: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Johannes Berg Cc: David Ahern Cc: linux-decnet-user@lists.sourceforge.net Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 4 ---- net/decnet/dn_dev.c | 12 +++++------- net/decnet/dn_nsp_out.c | 11 ----------- 3 files changed, 5 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 73a0399dc7a2..d4c9a8bbad3e 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -533,10 +533,6 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf scp->keepalive = 10 * HZ; scp->keepalive_fxn = dn_keepalive; - init_timer(&scp->delack_timer); - scp->delack_pending = 0; - scp->delack_fxn = dn_nsp_delayed_ack; - dn_start_slow_timer(sk); out: return sk; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 4d339de56862..92dbaa3f1eae 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1038,14 +1038,14 @@ static void dn_eth_down(struct net_device *dev) static void dn_dev_set_timer(struct net_device *dev); -static void dn_dev_timer_func(unsigned long arg) +static void dn_dev_timer_func(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; - struct dn_dev *dn_db; + struct dn_dev *dn_db = from_timer(dn_db, t, timer); + struct net_device *dev; struct dn_ifaddr *ifa; rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); + dev = dn_db->dev; if (dn_db->t3 <= dn_db->parms.t2) { if (dn_db->parms.timer3) { for (ifa = rcu_dereference(dn_db->ifa_list); @@ -1070,8 +1070,6 @@ static void dn_dev_set_timer(struct net_device *dev) if (dn_db->parms.t2 > dn_db->parms.t3) dn_db->parms.t2 = dn_db->parms.t3; - dn_db->timer.data = (unsigned long)dev; - dn_db->timer.function = dn_dev_timer_func; dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); add_timer(&dn_db->timer); @@ -1100,7 +1098,7 @@ static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) rcu_assign_pointer(dev->dn_ptr, dn_db); dn_db->dev = dev; - init_timer(&dn_db->timer); + timer_setup(&dn_db->timer, dn_dev_timer_func, 0); dn_db->uptime = jiffies; diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 66f035e476ea..e50a4adfcf7e 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -491,17 +491,6 @@ void dn_send_conn_ack (struct sock *sk) dn_nsp_send(skb); } -void dn_nsp_delayed_ack(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->ackxmt_oth != scp->numoth_rcv) - dn_nsp_send_oth_ack(sk); - - if (scp->ackxmt_dat != scp->numdat_rcv) - dn_nsp_send_data_ack(sk); -} - static int dn_nsp_retrans_conn_conf(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); -- cgit v1.2.3 From 83a37b3292f4aca799b355179ad6fbdd78a08e10 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:46 -0700 Subject: net/lapb: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Hans Liljestrand Cc: "Reshetova, Elena" Cc: linux-x25@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/lapb/lapb_iface.c | 4 ++-- net/lapb/lapb_timer.c | 18 ++++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index e15314e3b464..db6e0afe3a20 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -127,8 +127,8 @@ static struct lapb_cb *lapb_create_cb(void) skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); - init_timer(&lapb->t1timer); - init_timer(&lapb->t2timer); + timer_setup(&lapb->t1timer, NULL, 0); + timer_setup(&lapb->t2timer, NULL, 0); lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 1a5535bc3b8d..8bb469cb3abe 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -35,15 +35,14 @@ #include #include -static void lapb_t1timer_expiry(unsigned long); -static void lapb_t2timer_expiry(unsigned long); +static void lapb_t1timer_expiry(struct timer_list *); +static void lapb_t2timer_expiry(struct timer_list *); void lapb_start_t1timer(struct lapb_cb *lapb) { del_timer(&lapb->t1timer); - lapb->t1timer.data = (unsigned long)lapb; - lapb->t1timer.function = &lapb_t1timer_expiry; + lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; add_timer(&lapb->t1timer); @@ -53,8 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb) { del_timer(&lapb->t2timer); - lapb->t2timer.data = (unsigned long)lapb; - lapb->t2timer.function = &lapb_t2timer_expiry; + lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; add_timer(&lapb->t2timer); @@ -75,9 +73,9 @@ int lapb_t1timer_running(struct lapb_cb *lapb) return timer_pending(&lapb->t1timer); } -static void lapb_t2timer_expiry(unsigned long param) +static void lapb_t2timer_expiry(struct timer_list *t) { - struct lapb_cb *lapb = (struct lapb_cb *)param; + struct lapb_cb *lapb = from_timer(lapb, t, t2timer); if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; @@ -85,9 +83,9 @@ static void lapb_t2timer_expiry(unsigned long param) } } -static void lapb_t1timer_expiry(unsigned long param) +static void lapb_t1timer_expiry(struct timer_list *t) { - struct lapb_cb *lapb = (struct lapb_cb *)param; + struct lapb_cb *lapb = from_timer(lapb, t, t1timer); switch (lapb->state) { -- cgit v1.2.3 From 4966babd904d7f8e9e20735f3637a98fd7ca538c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:47 -0700 Subject: net/rose: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Ralf Baechle Cc: "David S. Miller" Cc: linux-hams@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/rose/af_rose.c | 17 +++++++++-------- net/rose/rose_link.c | 16 +++++++--------- net/rose/rose_loopback.c | 9 +++------ net/rose/rose_route.c | 8 ++++---- net/rose/rose_timer.c | 30 +++++++++++++----------------- 5 files changed, 36 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4a9729257023..6a5c4992cf61 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -318,9 +318,11 @@ void rose_destroy_socket(struct sock *); /* * Handler for deferred kills. */ -static void rose_destroy_timer(unsigned long data) +static void rose_destroy_timer(struct timer_list *t) { - rose_destroy_socket((struct sock *)data); + struct sock *sk = from_timer(sk, t, sk_timer); + + rose_destroy_socket(sk); } /* @@ -353,8 +355,7 @@ void rose_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - setup_timer(&sk->sk_timer, rose_destroy_timer, - (unsigned long)sk); + timer_setup(&sk->sk_timer, rose_destroy_timer, 0); sk->sk_timer.expires = jiffies + 10 * HZ; add_timer(&sk->sk_timer); } else @@ -538,8 +539,8 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rose_proto_ops; sk->sk_protocol = protocol; - init_timer(&rose->timer); - init_timer(&rose->idletimer); + timer_setup(&rose->timer, NULL, 0); + timer_setup(&rose->idletimer, NULL, 0); rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); @@ -582,8 +583,8 @@ static struct sock *rose_make_new(struct sock *osk) sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); - init_timer(&rose->timer); - init_timer(&rose->idletimer); + timer_setup(&rose->timer, NULL, 0); + timer_setup(&rose->idletimer, NULL, 0); orose = rose_sk(osk); rose->t1 = orose->t1; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index c76638cc2cd5..cda4c6678ef1 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -27,8 +27,8 @@ #include #include -static void rose_ftimer_expiry(unsigned long); -static void rose_t0timer_expiry(unsigned long); +static void rose_ftimer_expiry(struct timer_list *); +static void rose_t0timer_expiry(struct timer_list *); static void rose_transmit_restart_confirmation(struct rose_neigh *neigh); static void rose_transmit_restart_request(struct rose_neigh *neigh); @@ -37,8 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh) { del_timer(&neigh->ftimer); - neigh->ftimer.data = (unsigned long)neigh; - neigh->ftimer.function = &rose_ftimer_expiry; + neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry; neigh->ftimer.expires = jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); @@ -49,8 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh) { del_timer(&neigh->t0timer); - neigh->t0timer.data = (unsigned long)neigh; - neigh->t0timer.function = &rose_t0timer_expiry; + neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry; neigh->t0timer.expires = jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); @@ -77,13 +75,13 @@ static int rose_t0timer_running(struct rose_neigh *neigh) return timer_pending(&neigh->t0timer); } -static void rose_ftimer_expiry(unsigned long param) +static void rose_ftimer_expiry(struct timer_list *t) { } -static void rose_t0timer_expiry(unsigned long param) +static void rose_t0timer_expiry(struct timer_list *t) { - struct rose_neigh *neigh = (struct rose_neigh *)param; + struct rose_neigh *neigh = from_timer(neigh, t, t0timer); rose_transmit_restart_request(neigh); diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 344456206b70..7af4f99c4a93 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -19,12 +19,13 @@ static struct sk_buff_head loopback_queue; static struct timer_list loopback_timer; static void rose_set_loopback_timer(void); +static void rose_loopback_timer(struct timer_list *unused); void rose_loopback_init(void) { skb_queue_head_init(&loopback_queue); - init_timer(&loopback_timer); + timer_setup(&loopback_timer, rose_loopback_timer, 0); } static int rose_loopback_running(void) @@ -50,20 +51,16 @@ int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) return 1; } -static void rose_loopback_timer(unsigned long); static void rose_set_loopback_timer(void) { del_timer(&loopback_timer); - loopback_timer.data = 0; - loopback_timer.function = &rose_loopback_timer; loopback_timer.expires = jiffies + 10; - add_timer(&loopback_timer); } -static void rose_loopback_timer(unsigned long param) +static void rose_loopback_timer(struct timer_list *unused) { struct sk_buff *skb; struct net_device *dev; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 452bbb38d943..65921cd10323 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -104,8 +104,8 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, skb_queue_head_init(&rose_neigh->queue); - init_timer(&rose_neigh->ftimer); - init_timer(&rose_neigh->t0timer); + timer_setup(&rose_neigh->ftimer, NULL, 0); + timer_setup(&rose_neigh->t0timer, NULL, 0); if (rose_route->ndigis != 0) { rose_neigh->digipeat = @@ -390,8 +390,8 @@ void rose_add_loopback_neigh(void) skb_queue_head_init(&sn->queue); - init_timer(&sn->ftimer); - init_timer(&sn->t0timer); + timer_setup(&sn->ftimer, NULL, 0); + timer_setup(&sn->t0timer, NULL, 0); spin_lock_bh(&rose_neigh_list_lock); sn->next = rose_neigh_list; diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index bc5469d6d9cb..3b89d66f15bb 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -29,8 +29,8 @@ #include static void rose_heartbeat_expiry(unsigned long); -static void rose_timer_expiry(unsigned long); -static void rose_idletimer_expiry(unsigned long); +static void rose_timer_expiry(struct timer_list *); +static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { @@ -49,8 +49,7 @@ void rose_start_t1timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; add_timer(&rose->timer); @@ -62,8 +61,7 @@ void rose_start_t2timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; add_timer(&rose->timer); @@ -75,8 +73,7 @@ void rose_start_t3timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; add_timer(&rose->timer); @@ -88,8 +85,7 @@ void rose_start_hbtimer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; add_timer(&rose->timer); @@ -102,8 +98,7 @@ void rose_start_idletimer(struct sock *sk) del_timer(&rose->idletimer); if (rose->idle > 0) { - rose->idletimer.data = (unsigned long)sk; - rose->idletimer.function = &rose_idletimer_expiry; + rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; add_timer(&rose->idletimer); @@ -163,10 +158,10 @@ static void rose_heartbeat_expiry(unsigned long param) bh_unlock_sock(sk); } -static void rose_timer_expiry(unsigned long param) +static void rose_timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct rose_sock *rose = rose_sk(sk); + struct rose_sock *rose = from_timer(rose, t, timer); + struct sock *sk = &rose->sock; bh_lock_sock(sk); switch (rose->state) { @@ -192,9 +187,10 @@ static void rose_timer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void rose_idletimer_expiry(unsigned long param) +static void rose_idletimer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct rose_sock *rose = from_timer(rose, t, idletimer); + struct sock *sk = &rose->sock; bh_lock_sock(sk); rose_clear_queues(sk); -- cgit v1.2.3 From c3aed70953c39274bb55fe98e9b8344af1a7af75 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:56 -0700 Subject: xfrm: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() helper to pass the timer pointer explicitly. Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f06253969972..4838329bb43a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -57,7 +57,7 @@ static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); -static void xfrm_policy_queue_process(unsigned long arg); +static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, @@ -179,9 +179,9 @@ static inline unsigned long make_jiffies(long secs) return secs*HZ; } -static void xfrm_policy_timer(unsigned long data) +static void xfrm_policy_timer(struct timer_list *t) { - struct xfrm_policy *xp = (struct xfrm_policy *)data; + struct xfrm_policy *xp = from_timer(xp, t, timer); unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; @@ -267,10 +267,9 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); - setup_timer(&policy->timer, xfrm_policy_timer, - (unsigned long)policy); - setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process, - (unsigned long)policy); + timer_setup(&policy->timer, xfrm_policy_timer, 0); + timer_setup(&policy->polq.hold_timer, + xfrm_policy_queue_process, 0); } return policy; } @@ -1852,12 +1851,12 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, return xdst; } -static void xfrm_policy_queue_process(unsigned long arg) +static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; -- cgit v1.2.3 From 1fccb565e8b09e54467d41111f6faf08fcc9c3c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:06 -0700 Subject: net: can: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Oliver Hartkopp Cc: Marc Kleine-Budde Cc: "David S. Miller" Cc: linux-can@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/can/af_can.c | 4 ++-- net/can/af_can.h | 2 +- net/can/proc.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 88edac0f3e36..1f75e11ac35a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -882,8 +882,8 @@ static int can_pernet_init(struct net *net) if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { - setup_timer(&net->can.can_stattimer, can_stat_update, - (unsigned long)net); + timer_setup(&net->can.can_stattimer, can_stat_update, + 0); mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ)); } diff --git a/net/can/af_can.h b/net/can/af_can.h index d0ef45bb2a72..eca6463c6213 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -113,6 +113,6 @@ struct s_pstats { /* function prototypes for the CAN networklayer procfs (proc.c) */ void can_init_proc(struct net *net); void can_remove_proc(struct net *net); -void can_stat_update(unsigned long data); +void can_stat_update(struct timer_list *t); #endif /* AF_CAN_H */ diff --git a/net/can/proc.c b/net/can/proc.c index 83045f00c63c..d979b3dc49a6 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -115,9 +115,9 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, return rate; } -void can_stat_update(unsigned long data) +void can_stat_update(struct timer_list *t) { - struct net *net = (struct net *)data; + struct net *net = from_timer(net, t, can.can_stattimer); struct s_stats *can_stats = net->can.can_stats; unsigned long j = jiffies; /* snapshot */ @@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", can_stats->total_rx_match_ratio); @@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v) user_reset = 1; - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", can_pstats->stats_reset + 1); } else { -- cgit v1.2.3 From cdeabbb881343c1d554b83687a71f45280c592e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:17 -0700 Subject: net: sched: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Add pointer back to Qdisc. Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 7 +++---- net/sched/sch_generic.c | 6 +++--- net/sched/sch_pie.c | 10 ++++++---- net/sched/sch_red.c | 10 ++++++---- net/sched/sch_sfq.c | 10 +++++----- 5 files changed, 23 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index f3be6667a253..6b29cef90bec 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -345,9 +345,9 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } -static void flow_perturbation(unsigned long arg) +static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = (struct flow_filter *)arg; + struct flow_filter *f = from_timer(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) @@ -505,8 +505,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, get_random_bytes(&fnew->hashrnd, 4); } - setup_deferrable_timer(&fnew->perturb_timer, flow_perturbation, - (unsigned long)fnew); + timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); netif_keep_dst(qdisc_dev(tp->q)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a0a198768aad..6ced7c89c141 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -288,9 +288,9 @@ unsigned long dev_trans_start(struct net_device *dev) } EXPORT_SYMBOL(dev_trans_start); -static void dev_watchdog(unsigned long arg) +static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; + struct net_device *dev = from_timer(dev, t, watchdog_timer); netif_tx_lock(dev); if (!qdisc_tx_is_noop(dev)) { @@ -954,7 +954,7 @@ void dev_init_scheduler(struct net_device *dev) if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); + timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } static void shutdown_scheduler_queue(struct net_device *dev, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 6c2791d6102d..776c694c77c7 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -74,6 +74,7 @@ struct pie_sched_data { struct pie_vars vars; struct pie_stats stats; struct timer_list adapt_timer; + struct Qdisc *sch; }; static void pie_params_init(struct pie_params *params) @@ -422,10 +423,10 @@ static void calculate_probability(struct Qdisc *sch) pie_vars_init(&q->vars); } -static void pie_timer(unsigned long arg) +static void pie_timer(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct pie_sched_data *q = qdisc_priv(sch); + struct pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -446,7 +447,8 @@ static int pie_init(struct Qdisc *sch, struct nlattr *opt) pie_vars_init(&q->vars); sch->limit = q->params.limit; - setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + q->sch = sch; + timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { int err = pie_change(sch, opt); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 93b9d70a9b28..fdfdb56aaae2 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -40,6 +40,7 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; struct timer_list adapt_timer; + struct Qdisc *sch; struct red_parms parms; struct red_vars vars; struct red_stats stats; @@ -221,10 +222,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static inline void red_adaptative_timer(unsigned long arg) +static inline void red_adaptative_timer(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct red_sched_data *q = qdisc_priv(sch); + struct red_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -238,7 +239,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) struct red_sched_data *q = qdisc_priv(sch); q->qdisc = &noop_qdisc; - setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); + q->sch = sch; + timer_setup(&q->adapt_timer, red_adaptative_timer, 0); return red_change(sch, opt); } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 123a53af2506..5a9c95149c5c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -145,6 +145,7 @@ struct sfq_sched_data { int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; + struct Qdisc *sch; }; /* @@ -604,10 +605,10 @@ drop: qdisc_tree_reduce_backlog(sch, dropped, drop_len); } -static void sfq_perturbation(unsigned long arg) +static void sfq_perturbation(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct sfq_sched_data *q = qdisc_priv(sch); + struct sfq_sched_data *q = from_timer(q, t, perturb_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -722,8 +723,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) int i; int err; - setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, - (unsigned long)sch); + timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) -- cgit v1.2.3 From a92c5751b97cca55d8140ec0bf26a53c7e00bfa5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:18 -0700 Subject: netfilter: ipset: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This introduces a pointer back to the struct ip_set, which is used instead of the struct timer_list .data field. Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: Stephen Hemminger Cc: simran singhal Cc: Muhammad Falak R Wani Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/netfilter/ipset/ip_set_bitmap_gen.h | 10 +++++----- net/netfilter/ipset/ip_set_bitmap_ip.c | 2 ++ net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 ++ net/netfilter/ipset/ip_set_bitmap_port.c | 2 ++ net/netfilter/ipset/ip_set_hash_gen.h | 12 +++++++----- net/netfilter/ipset/ip_set_list_set.c | 12 +++++++----- 6 files changed, 25 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8ad2b52a0b32..5ca18f07683b 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -37,11 +37,11 @@ #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void -mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; - setup_timer(&map->gc, gc, (unsigned long)set); + timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } @@ -272,10 +272,10 @@ out: } static void -mtype_gc(unsigned long ul_set) +mtype_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct mtype *map = set->data; + struct mtype *map = from_timer(map, t, gc); + struct ip_set *set = map->set; void *x; u32 id; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 4783efff0bde..d8975a0b4282 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -48,6 +48,7 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -232,6 +233,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->netmask = netmask; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 9a065f672d3a..4c279fbd2d5d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -52,6 +52,7 @@ struct bitmap_ipmac { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; @@ -307,6 +308,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 7f0c733358a4..7f9bbd7c98b5 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -40,6 +40,7 @@ struct bitmap_port { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -214,6 +215,7 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_UNSPEC; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 51063d9ed0f7..efffc8eabafe 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -280,6 +280,7 @@ htable_bits(u32 hashsize) struct htype { struct htable __rcu *table; /* the hash table */ struct timer_list gc; /* garbage collection when timeout enabled */ + struct ip_set *set; /* attached to this ip_set */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -429,11 +430,11 @@ mtype_destroy(struct ip_set *set) } static void -mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct htype *h = set->data; - setup_timer(&h->gc, gc, (unsigned long)set); + timer_setup(&h->gc, gc, 0); mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); pr_debug("gc initialized, run in every %u\n", IPSET_GC_PERIOD(set->timeout)); @@ -526,10 +527,10 @@ mtype_expire(struct ip_set *set, struct htype *h) } static void -mtype_gc(unsigned long ul_set) +mtype_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct htype *h = set->data; + struct htype *h = from_timer(h, t, gc); + struct ip_set *set = h->set; pr_debug("called\n"); spin_lock_bh(&set->lock); @@ -1314,6 +1315,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, t->htable_bits = hbits; RCU_INIT_POINTER(h->table, t); + h->set = set; set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 178d4eba013b..c9b4e05ad940 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -44,6 +44,7 @@ struct set_adt_elem { struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ struct net *net; /* namespace */ struct list_head members; /* the set members */ }; @@ -571,10 +572,10 @@ static const struct ip_set_type_variant set_variant = { }; static void -list_set_gc(unsigned long ul_set) +list_set_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct list_set *map = set->data; + struct list_set *map = from_timer(map, t, gc); + struct ip_set *set = map->set; spin_lock_bh(&set->lock); set_cleanup_entries(set); @@ -585,11 +586,11 @@ list_set_gc(unsigned long ul_set) } static void -list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +list_set_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct list_set *map = set->data; - setup_timer(&map->gc, gc, (unsigned long)set); + timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } @@ -606,6 +607,7 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) map->size = size; map->net = net; + map->set = set; INIT_LIST_HEAD(&map->members); set->data = map; -- cgit v1.2.3 From 59f379f9046a9e0532ffd19b44e3c32fe79ec51b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:19 -0700 Subject: inet/connection_sock: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Gerrit Renker Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: dccp@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/dccp/timer.c | 18 ++++++++++-------- net/ipv4/inet_connection_sock.c | 21 +++++++++------------ net/ipv4/tcp_timer.c | 18 +++++++++++------- 3 files changed, 30 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3a2c34027758..1e35526bf436 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -125,10 +125,11 @@ static void dccp_retransmit_timer(struct sock *sk) __sk_dst_reset(sk); } -static void dccp_write_timer(unsigned long data) +static void dccp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); @@ -161,19 +162,20 @@ out: sock_put(sk); } -static void dccp_keepalive_timer(unsigned long data) +static void dccp_keepalive_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(unsigned long data) +static void dccp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8a91ebbf0c01..5c965ecc96a0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -494,17 +494,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7014cc00c74c..804a8d34ce86 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -288,15 +288,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -576,9 +578,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -613,9 +617,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; -- cgit v1.2.3 From 78802011fbe34331bdef6f2dfb1634011f0e4c32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:20 -0700 Subject: inet: frags: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Alexander Aring Cc: Stefan Schmidt Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: linux-wpan@vger.kernel.org Cc: netdev@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Stefan Schmidt # for ieee802154 Signed-off-by: David S. Miller --- net/ieee802154/6lowpan/reassembly.c | 5 +++-- net/ipv4/inet_fragment.c | 4 ++-- net/ipv4/ip_fragment.c | 5 +++-- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 +++-- net/ipv6/reassembly.c | 5 +++-- 5 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index f85b08baff16..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -80,12 +80,13 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) fq->daddr = *arg->dst; } -static void lowpan_frag_expire(unsigned long data) +static void lowpan_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); spin_lock(&fq->q.lock); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..7f3ef5c287a1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46408c220d9d..9215654a401f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -190,12 +190,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b263bf3a19f7..977d8900cfd1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -169,12 +169,13 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q) return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_ct_frag6_expire(unsigned long data) +static void nf_ct_frag6_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); ip6_expire_frag_queue(net, fq, &nf_frags); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 846012eae526..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -170,12 +170,13 @@ out: } EXPORT_SYMBOL(ip6_expire_frag_queue); -static void ip6_frag_expire(unsigned long data) +static void ip6_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); ip6_expire_frag_queue(net, fq, &ip6_frags); -- cgit v1.2.3 From 9f12a77e467b8ec0296cf1c3481447e1b1811f59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:21 -0700 Subject: net/core: Collapse redundant sk_timer callback data assignments The core sk_timer initializer can provide the common .data assignment instead of it being set separately in users. Cc: "David S. Miller" Cc: Ralf Baechle Cc: Andrew Hendry Cc: Eric Dumazet Cc: Paolo Abeni Cc: David Howells Cc: Colin Ian King Cc: Ingo Molnar Cc: linzhang Cc: netdev@vger.kernel.org Cc: linux-hams@vger.kernel.org Cc: linux-x25@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- net/netrom/nr_timer.c | 1 - net/rose/rose_timer.c | 1 - net/x25/af_x25.c | 1 - net/x25/x25_timer.c | 1 - 5 files changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 23953b741a41..f577df17bcf2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2683,7 +2683,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk_init_common(sk); sk->sk_send_head = NULL; - init_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, NULL, (unsigned long)sk); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 94d05806a9a2..f84ce71f1f5f 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -45,7 +45,6 @@ void nr_init_timers(struct sock *sk) setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &nr_heartbeat_expiry; } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 3b89d66f15bb..e08201185214 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -36,7 +36,6 @@ void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ac095936552d..c590c0bd1393 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -414,7 +414,6 @@ static void __x25_destroy_socket(struct sock *sk) /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; - sk->sk_timer.data = (unsigned long)sk; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 5c5db1a36399..de5cec41d100 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -36,7 +36,6 @@ void x25_init_timers(struct sock *sk) setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &x25_heartbeat_expiry; } -- cgit v1.2.3 From 99767f278ccf74a1857069bb3eec991e572f94cd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:36 -0700 Subject: net/core: Convert sk_timer users to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly for all users of sk_timer. Cc: "David S. Miller" Cc: Ralf Baechle Cc: Andrew Hendry Cc: Eric Dumazet Cc: Paolo Abeni Cc: David Howells Cc: Julia Lawall Cc: linzhang Cc: Ingo Molnar Cc: netdev@vger.kernel.org Cc: linux-hams@vger.kernel.org Cc: linux-x25@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- net/netrom/af_netrom.c | 6 +++--- net/netrom/nr_timer.c | 47 ++++++++++++++++++++++++----------------------- net/rose/rose_timer.c | 8 ++++---- net/x25/af_x25.c | 8 +++++--- net/x25/x25_timer.c | 17 +++++++++-------- 6 files changed, 46 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f577df17bcf2..35656a9e4e44 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2683,7 +2683,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk_init_common(sk); sk->sk_send_head = NULL; - setup_timer(&sk->sk_timer, NULL, (unsigned long)sk); + timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ebf16f7f9089..2dec3583c97d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -241,9 +241,9 @@ void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. */ -static void nr_destroy_timer(unsigned long data) +static void nr_destroy_timer(struct timer_list *t) { - struct sock *sk=(struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); @@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - sk->sk_timer.function = nr_destroy_timer; + sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index f84ce71f1f5f..43569aea0f5e 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -29,23 +29,23 @@ #include #include -static void nr_heartbeat_expiry(unsigned long); -static void nr_t1timer_expiry(unsigned long); -static void nr_t2timer_expiry(unsigned long); -static void nr_t4timer_expiry(unsigned long); -static void nr_idletimer_expiry(unsigned long); +static void nr_heartbeat_expiry(struct timer_list *); +static void nr_t1timer_expiry(struct timer_list *); +static void nr_t2timer_expiry(struct timer_list *); +static void nr_t4timer_expiry(struct timer_list *); +static void nr_idletimer_expiry(struct timer_list *); void nr_init_timers(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk); - setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk); - setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk); - setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); + timer_setup(&nr->t1timer, nr_t1timer_expiry, 0); + timer_setup(&nr->t2timer, nr_t2timer_expiry, 0); + timer_setup(&nr->t4timer, nr_t4timer_expiry, 0); + timer_setup(&nr->idletimer, nr_idletimer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = &nr_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry; } void nr_start_t1timer(struct sock *sk) @@ -112,9 +112,9 @@ int nr_t1timer_running(struct sock *sk) return timer_pending(&nr_sk(sk)->t1timer); } -static void nr_heartbeat_expiry(unsigned long param) +static void nr_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); struct nr_sock *nr = nr_sk(sk); bh_lock_sock(sk); @@ -151,10 +151,10 @@ static void nr_heartbeat_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t2timer_expiry(unsigned long param) +static void nr_t2timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, t2timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); if (nr->condition & NR_COND_ACK_PENDING) { @@ -164,19 +164,20 @@ static void nr_t2timer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t4timer_expiry(unsigned long param) +static void nr_t4timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct nr_sock *nr = from_timer(nr, t, t4timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; bh_unlock_sock(sk); } -static void nr_idletimer_expiry(unsigned long param) +static void nr_idletimer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, idletimer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); @@ -201,10 +202,10 @@ static void nr_idletimer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t1timer_expiry(unsigned long param) +static void nr_t1timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, t1timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); switch (nr->state) { diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index e08201185214..ea613b2a9735 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -28,7 +28,7 @@ #include #include -static void rose_heartbeat_expiry(unsigned long); +static void rose_heartbeat_expiry(struct timer_list *t); static void rose_timer_expiry(struct timer_list *); static void rose_idletimer_expiry(struct timer_list *); @@ -36,7 +36,7 @@ void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.function = &rose_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; add_timer(&sk->sk_timer); @@ -119,9 +119,9 @@ void rose_stop_idletimer(struct sock *sk) del_timer(&rose_sk(sk)->idletimer); } -static void rose_heartbeat_expiry(unsigned long param) +static void rose_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c590c0bd1393..ea87143314f3 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -374,9 +374,11 @@ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ -static void x25_destroy_timer(unsigned long data) +static void x25_destroy_timer(struct timer_list *t) { - x25_destroy_socket_from_timer((struct sock *)data); + struct sock *sk = from_timer(sk, t, sk_timer); + + x25_destroy_socket_from_timer(sk); } /* @@ -413,7 +415,7 @@ static void __x25_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; - sk->sk_timer.function = x25_destroy_timer; + sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index de5cec41d100..1dfba3c23459 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -26,17 +26,17 @@ #include #include -static void x25_heartbeat_expiry(unsigned long); -static void x25_timer_expiry(unsigned long); +static void x25_heartbeat_expiry(struct timer_list *t); +static void x25_timer_expiry(struct timer_list *t); void x25_init_timers(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); - setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); + timer_setup(&x25->timer, x25_timer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = &x25_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry; } void x25_start_heartbeat(struct sock *sk) @@ -92,9 +92,9 @@ unsigned long x25_display_timer(struct sock *sk) return x25->timer.expires - jiffies; } -static void x25_heartbeat_expiry(unsigned long param) +static void x25_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ @@ -159,9 +159,10 @@ static inline void x25_do_timer_expiry(struct sock * sk) } } -static void x25_timer_expiry(unsigned long param) +static void x25_timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct x25_sock *x25 = from_timer(x25, t, timer); + struct sock *sk = &x25->sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */ -- cgit v1.2.3 From ba421793505f91026cccbf9398ff866dd308036d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:37 -0700 Subject: net: atm: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Also drops a redundant initialization that is already set up by DEFINE_TIMER. Cc: "David S. Miller" Cc: Hans Liljestrand Cc: "Reshetova, Elena" Cc: Bhumika Goyal Cc: Johannes Berg Cc: Roopa Prabhu Cc: Augusto Mecking Caringi Cc: Jarod Wilson Cc: Kalle Valo Cc: Thomas Gleixner Cc: Alexey Dobriyan Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/atm/clip.c | 4 ++-- net/atm/lec.c | 19 +++++++++---------- net/atm/mpc.c | 1 - 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 65f706e4344c..d4f6029d5109 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -153,7 +153,7 @@ static int neigh_check_cb(struct neighbour *n) return 1; } -static void idle_timer_check(unsigned long dummy) +static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); @@ -887,7 +887,7 @@ static int __init atm_clip_init(void) register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); - setup_timer(&idle_timer, idle_timer_check, 0); + timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { diff --git a/net/atm/lec.c b/net/atm/lec.c index a3d93a1bb133..c976196da3ea 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1232,7 +1232,7 @@ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr, #define LEC_ARP_REFRESH_INTERVAL (3*HZ) static void lec_arp_check_expire(struct work_struct *work); -static void lec_arp_expire_arp(unsigned long data); +static void lec_arp_expire_arp(struct timer_list *t); /* * Arp table funcs @@ -1559,8 +1559,7 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, } ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); - setup_timer(&to_return->timer, lec_arp_expire_arp, - (unsigned long)to_return); + timer_setup(&to_return->timer, lec_arp_expire_arp, 0); to_return->last_used = jiffies; to_return->priv = priv; skb_queue_head_init(&to_return->tx_wait); @@ -1569,11 +1568,11 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, } /* Arp sent timer expired */ -static void lec_arp_expire_arp(unsigned long data) +static void lec_arp_expire_arp(struct timer_list *t) { struct lec_arp_table *entry; - entry = (struct lec_arp_table *)data; + entry = from_timer(entry, t, timer); pr_debug("\n"); if (entry->status == ESI_ARP_PENDING) { @@ -1591,10 +1590,10 @@ static void lec_arp_expire_arp(unsigned long data) } /* Unknown/unused vcc expire, remove associated entry */ -static void lec_arp_expire_vcc(unsigned long data) +static void lec_arp_expire_vcc(struct timer_list *t) { unsigned long flags; - struct lec_arp_table *to_remove = (struct lec_arp_table *)data; + struct lec_arp_table *to_remove = from_timer(to_remove, t, timer); struct lec_priv *priv = to_remove->priv; del_timer(&to_remove->timer); @@ -1799,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); - entry->timer.function = lec_arp_expire_arp; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } @@ -1999,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = lec_arp_expire_vcc; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); dump_arp_table(priv); @@ -2083,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = lec_arp_expire_vcc; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 5677147209e8..b43d99430eb6 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -799,7 +799,6 @@ static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg) int err; if (mpcs == NULL) { - init_timer(&mpc_timer); mpc_timer_refresh(); /* This lets us now how our LECs are doing */ -- cgit v1.2.3 From 1ab791dc27faef5aee80fe76d73980d2de0bebc8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:41 -0700 Subject: ipv4: timewait: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..a4bab81f1462 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -142,9 +142,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -static void tw_timer_handler(unsigned long data) +static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); @@ -188,8 +188,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_pinned_timer(&tw->tw_timer, tw_timer_handler, - (unsigned long)tw); + timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this -- cgit v1.2.3 From ff861c4d64f2df1c7eaabaf2ba8f2f8ebc4b28e3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:42 -0700 Subject: sunrpc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Trond Myklebust Cc: Anna Schumaker Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: "David S. Miller" Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/sunrpc/sched.c | 8 ++++---- net/sunrpc/svc.c | 2 +- net/sunrpc/svc_xprt.c | 9 ++++----- net/sunrpc/xprt.c | 9 ++++----- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 0cc83839c13c..5dea47eb31bb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -44,7 +44,7 @@ static mempool_t *rpc_buffer_mempool __read_mostly; static void rpc_async_schedule(struct work_struct *); static void rpc_release_task(struct rpc_task *task); -static void __rpc_queue_timer_fn(unsigned long ptr); +static void __rpc_queue_timer_fn(struct timer_list *t); /* * RPC tasks sit here while waiting for conditions to improve. @@ -228,7 +228,7 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); queue->qlen = 0; - setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue); + timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } @@ -635,9 +635,9 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) } EXPORT_SYMBOL_GPL(rpc_wake_up_status); -static void __rpc_queue_timer_fn(unsigned long ptr) +static void __rpc_queue_timer_fn(struct timer_list *t) { - struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr; + struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer); struct rpc_task *task, *n; unsigned long expires, now, timeo; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index aa04666f929d..33f4ae68426d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -455,7 +455,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); - init_timer(&serv->sv_temptimer); + timer_setup(&serv->sv_temptimer, NULL, 0); spin_lock_init(&serv->sv_lock); __svc_init_bc(serv); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d16a8b423c20..71de77bd4423 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -28,7 +28,7 @@ module_param(svc_rpc_per_connection_limit, uint, 0644); static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); -static void svc_age_temp_xprts(unsigned long closure); +static void svc_age_temp_xprts(struct timer_list *t); static void svc_delete_xprt(struct svc_xprt *xprt); /* apparently the "standard" is that clients close @@ -785,8 +785,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp transports */ - setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, - (unsigned long)serv); + serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts; mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } @@ -960,9 +959,9 @@ out: * Timer function to close old temporary transports, using * a mark-and-sweep algorithm. */ -static void svc_age_temp_xprts(unsigned long closure) +static void svc_age_temp_xprts(struct timer_list *t) { - struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_serv *serv = from_timer(serv, t, sv_temptimer); struct svc_xprt *xprt; struct list_head *le, *next; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e741ec2b4d8e..4b00302e1867 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -696,9 +696,9 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt) } static void -xprt_init_autodisconnect(unsigned long data) +xprt_init_autodisconnect(struct timer_list *t) { - struct rpc_xprt *xprt = (struct rpc_xprt *)data; + struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv)) @@ -1422,10 +1422,9 @@ found: xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) - setup_timer(&xprt->timer, xprt_init_autodisconnect, - (unsigned long)xprt); + timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); else - init_timer(&xprt->timer); + timer_setup(&xprt->timer, NULL, 0); if (strlen(args->servername) > RPC_MAXNETNAMELEN) { xprt_destroy(xprt); -- cgit v1.2.3 From 86e58cce939cefd806105922e4a0840e9a04a88f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 15:11:22 -0500 Subject: decnet: af_decnet: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d4c9a8bbad3e..518cea17b811 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -630,10 +630,12 @@ static void dn_destroy_sock(struct sock *sk) goto disc_reject; case DN_RUN: scp->state = DN_DI; + /* fall through */ case DN_DI: case DN_DR: disc_reject: dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); + /* fall through */ case DN_NC: case DN_NR: case DN_RJ: @@ -647,6 +649,7 @@ disc_reject: break; default: printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); + /* fall through */ case DN_O: dn_stop_slow_timer(sk); -- cgit v1.2.3 From fcfd6dfab97006d44c7db5d6c908eac383af6649 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 15:48:55 -0500 Subject: ipv4: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in some cases I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Addresses-Coverity-ID: 115108 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 ++- net/ipv4/arp.c | 1 + net/ipv4/devinet.c | 1 + net/ipv4/ipmr.c | 1 + net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 3 ++- net/ipv4/tcp_input.c | 2 ++ net/ipv4/tcp_ipv4.c | 3 ++- 7 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 43a1bbed7a42..ce4aa827be05 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -827,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how) err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for POLLHUP, even on eg. unconnected UDP sockets -- RR */ + /* fall through */ default: sk->sk_shutdown |= how; if (sk->sk_prot->shutdown) @@ -840,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how) case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; - /* Fall through */ + /* fall through */ case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b8896709..a8d7c5a9fb05 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; + /* fall through */ case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6d9b072d903b..e1e2ec0525e6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1522,6 +1522,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ + /* fall through */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b3ee01b0551b..40a43ad294cb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1528,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; + /* fall through */ case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index a0f37b208268..0443ca4120b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d0682ce2a5d6..b2390bfdc68f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2885,6 +2885,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ + /* fall through */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -6044,6 +6045,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5418ecf03b78..ecee4ddb24c5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1779,8 +1779,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; -- cgit v1.2.3 From 275757e6bae15a8621130907a78096afd1e15d2c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 16:36:52 -0500 Subject: ipv6: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in some cases I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv6/ah6.c | 1 + net/ipv6/exthdrs.c | 1 + net/ipv6/icmp.c | 6 ++---- net/ipv6/ip6_fib.c | 4 ++++ net/ipv6/ip6_tunnel.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 3 ++- net/ipv6/raw.c | 4 ++++ net/ipv6/tcp_ipv6.c | 3 ++- net/ipv6/xfrm6_policy.c | 1 + 10 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 7802b72196f3..37bb33fbc742 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -271,6 +271,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) case NEXTHDR_DEST: if (dir == XFRM_POLICY_OUT) ipv6_rearrange_destopt(iph, exthdr.opth); + /* fall through */ case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { net_dbg_ratelimited("overrun %sopts\n", diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 95516138e861..7835dea930b4 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -89,6 +89,7 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; + /* fall through */ case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return false; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4e52d52a6752..6ae5dd3f4d0d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -864,10 +864,8 @@ static int icmpv6_rcv(struct sk_buff *skb) goto discard_it; hdr = icmp6_hdr(skb); - /* - * Drop through to notify - */ - + /* to notify */ + /* fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 548af48212fc..1ada9672d198 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1780,6 +1780,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_L; #endif + /* fall through */ case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { @@ -1788,6 +1789,7 @@ static int fib6_walk_continue(struct fib6_walker *w) continue; } w->state = FWS_R; + /* fall through */ case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { @@ -1797,6 +1799,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); + /* fall through */ case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1815,6 +1818,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } skip: w->state = FWS_U; + /* fall through */ case FWS_U: if (fn == w->root) return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 825548680b1e..4212879ff35e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -593,6 +593,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case NDISC_REDIRECT: rel_type = ICMP_REDIRECT; rel_code = ICMP_REDIR_HOST; + /* fall through */ default: return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f5500f5444e9..59fad81e5f7a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1722,6 +1722,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; + /* fall through */ case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 46d6dba50698..1d2fb9267d6f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -290,7 +290,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e4462b0ff801..761a473a07c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1055,6 +1055,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } @@ -1077,6 +1078,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1138,6 +1140,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } @@ -1160,6 +1163,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 64d94afa427f..ae83615b7f6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1577,8 +1577,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 11d1314ab6c5..4ed9f8cc3b6a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -152,6 +152,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; + /* fall through */ case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: -- cgit v1.2.3 From d4f4da3e13ae7a405fa1da11ff4070588e700445 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 16:53:16 -0500 Subject: net: ipx: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index ac598ec90589..d21a9d128d3e 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1867,6 +1867,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; + /* fall through */ case SIOCGIFADDR: rc = ipxitf_ioctl(cmd, argp); break; -- cgit v1.2.3 From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 4d88e0665c41..1dd3034f846f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4395,68 +4395,95 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops sk_filter_prog_ops = { +const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_verifier_ops tc_cls_act_prog_ops = { +const struct bpf_prog_ops sk_filter_prog_ops = { +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_prog_ops = { +const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_prog_ops = { +const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_prog_ops = { +const struct bpf_verifier_ops lwt_inout_verifier_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_prog_ops = { +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_prog_ops = { +const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_verifier_ops sock_ops_prog_ops = { +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_verifier_ops sk_skb_prog_ops = { +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 1dd3034f846f..7373a08fbef7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3732,6 +3732,23 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool +tc_cls_act_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3766,6 +3783,21 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } +static bool xdp_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4411,6 +4443,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; +const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { + .is_valid_access = tc_cls_act_is_valid_access_analyzer, +}; + const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4421,6 +4457,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +const struct bpf_verifier_ops xdp_analyzer_ops = { + .is_valid_access = xdp_is_valid_access_analyzer, +}; + const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 29d1b33a2e0a3288374102b004a15cb278a2303e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:56 -0700 Subject: bpf: allow access to skb->len from offloads Since we are now doing strict checking of what offloads may access, make sure skb->len is on that list. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 7373a08fbef7..09e011f20291 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3738,6 +3738,8 @@ tc_cls_act_is_valid_access_analyzer(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { + case offsetof(struct sk_buff, len): + return true; case offsetof(struct sk_buff, data): info->reg_type = PTR_TO_PACKET; return true; -- cgit v1.2.3 From b9f1f1ce866c28e3d9b86202441b220244754a69 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Oct 2017 19:38:35 -0700 Subject: tcp: fix tcp_xmit_retransmit_queue() after rbtree introduction I tried to hard avoiding a call to rb_first() (via tcp_rtx_queue_head) in tcp_xmit_retransmit_queue(). But this was probably too bold. Quoting Yuchung : We might miss re-arming the RTO if tp->retransmit_skb_hint is not NULL. This can happen when RACK marks the first packet lost again and resets tp->retransmit_skb_hint for example (tcp_rack_mark_skb_lost()) Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6c74f2a39778..53dc1267c85e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2921,7 +2921,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb, *rtx_head = NULL, *hole = NULL; + struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); u32 max_segs; int mib_idx; @@ -2929,11 +2929,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!tp->packets_out) return; - skb = tp->retransmit_skb_hint; - if (!skb) { - rtx_head = tcp_rtx_queue_head(sk); - skb = rtx_head; - } + rtx_head = tcp_rtx_queue_head(sk); + skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; -- cgit v1.2.3 From 22ce97fe49b5522e0f97b7c2282ed71a1abd7410 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Oct 2017 16:01:30 +0100 Subject: mqprio: fix potential null pointer dereference on opt The pointer opt has a null check however before for this check opt is dereferenced when len is initialized, hence we potentially have a null pointer deference on opt. Avoid this by checking for a null opt before dereferencing it. Detected by CoverityScan, CID#1458234 ("Dereference before null check") Fixes: 4e8b86c06269 ("mqprio: Introduce new hardware offload mode and shaper in mqprio") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/sched/sch_mqprio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index cae91b4b08a6..51c2b289c69b 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -142,7 +142,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct nlattr *tb[TCA_MQPRIO_MAX + 1]; struct nlattr *attr; int rem; - int len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); + int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); @@ -164,6 +164,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (mqprio_parse_opt(dev, qopt)) return -EINVAL; + len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, sizeof(*qopt)); -- cgit v1.2.3 From be070c77ca805a4f06f135599415195078f0ed68 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 17 Oct 2017 17:42:53 -0500 Subject: net: l2tp: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case I replaced the "NOBREAK" comment with a "fall through" comment, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 7135f4645d3a..f5179424eaf1 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -406,7 +406,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) goto nla_put_failure; - /* NOBREAK */ + /* fall through */ case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (np) { -- cgit v1.2.3 From c75e427d9349ec3e0059752cc784e1c301474c2d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Oct 2017 10:48:25 +0300 Subject: tipc: checking for NULL instead of IS_ERR() The tipc_alloc_conn() function never returns NULL, it returns error pointers, so I have fixed the check. Fixes: 14c04493cb77 ("tipc: add ability to order and receive topology events in driver") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 713077536d0c..acaef80fb88c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -504,7 +504,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, *(u32 *)&sub.usr_handle = port; con = tipc_alloc_conn(tipc_topsrv(net)); - if (!con) + if (IS_ERR(con)) return false; *conid = con->conid; -- cgit v1.2.3 From d18b4b35e310c5e30a3726309a93db8893cd3251 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Oct 2017 10:33:37 +0200 Subject: net: sched: cls_u32: use hash_ptr() for tc_u_hash After the change to the tp hash, we now get a build warning on 32-bit architectures: net/sched/cls_u32.c: In function 'tc_u_hash': net/sched/cls_u32.c:338:17: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] return hash_64((u64) tp->chain->block, U32_HASH_SHIFT); Using hash_ptr() instead of hash_64() lets us drop the cast and fixes the warning while still resulting in the same hash value. Fixes: 7fa9d974f3c2 ("net: sched: cls_u32: use block instead of q in tc_u_common") Signed-off-by: Arnd Bergmann Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b6d46065f661..49d96b45a8ce 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -335,7 +335,7 @@ static struct hlist_head *tc_u_common_hash; static unsigned int tc_u_hash(const struct tcf_proto *tp) { - return hash_64((u64) tp->chain->block, U32_HASH_SHIFT); + return hash_ptr(tp->chain->block, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) -- cgit v1.2.3 From 0843c092ee75bb375fbbb6b97a7c55e0069ae099 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 18 Oct 2017 18:38:08 +0300 Subject: net/sched: Set the net-device for egress device instance Currently the netdevice field is not set and the egdev instance is not functional, fix that. Fixes: 3f55bdda8df ('net: sched: introduce per-egress action device callbacks') Signed-off-by: Or Gerlitz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index ac97db92ab68..c67b820a8307 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1300,6 +1300,7 @@ tcf_action_egdev_get(const struct net_device *dev) if (!egdev) return NULL; INIT_LIST_HEAD(&egdev->cb_list); + egdev->dev = dev; tan = net_generic(dev_net(dev), tcf_action_net_id); rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, tcf_action_egdev_ht_params); -- cgit v1.2.3 From f3d9832e56c48e4ca50bab0457e21bcaade4536d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:52 -0700 Subject: ipv6: addrconf: cleanup locking in ipv6_add_addr ipv6_add_addr is called in process context with rtnl lock held (e.g., manual config of an address) or during softirq processing (e.g., autoconf and address from a router advertisement). Currently, ipv6_add_addr calls rcu_read_lock_bh shortly after entry and does not call unlock until exit, minus the call around the address validator notifier. Similarly, addrconf_hash_lock is taken after the validator notifier and held until exit. This forces the allocation of inet6_ifaddr to always be atomic. Refactor ipv6_add_addr as follows: 1. add an input boolean to discriminate the call path (process context or softirq). This new flag controls whether the alloc can be done with GFP_KERNEL or GFP_ATOMIC. 2. Move the rcu_read_lock_bh and unlock calls only around functions that do rcu updates. 3. Remove the in6_dev_hold and put added by 3ad7d2468f79f ("Ipvlan should return an error when an address is already in use."). This was done presumably because rcu_read_unlock_bh needs to be called before calling the validator. Since rcu_read_lock is not needed before the validator runs revert the hold and put added by 3ad7d2468f79f and only do the hold when setting ifp->idev. 4. move duplicate address check and insertion of new address in the global address hash into a helper. The helper is called after an ifa is allocated and filled in. This allows the ifa for manually configured addresses to be done with GFP_KERNEL and reduces the overall amount of time with rcu_read_lock held and hash table spinlock held. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 104 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4603aa488f4f..a8d202b1b919 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -957,18 +957,43 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } +static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) +{ + unsigned int hash; + int err = 0; + + spin_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev)) { + ADBG("ipv6_add_addr: already assigned\n"); + err = -EEXIST; + goto out; + } + + /* Add to big hash table */ + hash = inet6_addr_hash(&ifa->addr); + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + +out: + spin_unlock(&addrconf_hash_lock); + + return err; +} + /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, - int scope, u32 flags, u32 valid_lft, u32 prefered_lft) + int scope, u32 flags, u32 valid_lft, u32 prefered_lft, + bool can_block) { + gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt; + struct rt6_info *rt = NULL; struct in6_validator_info i6vi; - unsigned int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -978,42 +1003,24 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); - rcu_read_lock_bh(); - - in6_dev_hold(idev); - if (idev->dead) { err = -ENODEV; /*XXX*/ - goto out2; + goto out; } if (idev->cnf.disable_ipv6) { err = -EACCES; - goto out2; + goto out; } i6vi.i6vi_addr = *addr; i6vi.i6vi_dev = idev; - rcu_read_unlock_bh(); - err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); - - rcu_read_lock_bh(); err = notifier_to_errno(err); - if (err) - goto out2; - - spin_lock(&addrconf_hash_lock); - - /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG("ipv6_add_addr: already assigned\n"); - err = -EEXIST; + if (err < 0) goto out; - } - - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; @@ -1023,6 +1030,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, rt = addrconf_dst_alloc(idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); + rt = NULL; goto out; } @@ -1053,16 +1061,21 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->rt = rt; ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ refcount_set(&ifa->refcnt, 1); - /* Add to big hash table */ - hash = inet6_addr_hash(addr); + rcu_read_lock_bh(); - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); - spin_unlock(&addrconf_hash_lock); + err = ipv6_add_addr_hash(idev->dev, ifa); + if (err < 0) { + rcu_read_unlock_bh(); + goto out; + } write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. */ ipv6_link_dev_addr(idev, ifa); @@ -1073,21 +1086,23 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, in6_ifa_hold(ifa); write_unlock(&idev->lock); -out2: + rcu_read_unlock_bh(); - if (likely(err == 0)) - inet6addr_notifier_call_chain(NETDEV_UP, ifa); - else { - kfree(ifa); - in6_dev_put(idev); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); +out: + if (unlikely(err < 0)) { + if (rt) + ip6_rt_put(rt); + if (ifa) { + if (ifa->idev) + in6_dev_put(ifa->idev); + kfree(ifa); + } ifa = ERR_PTR(err); } return ifa; -out: - spin_unlock(&addrconf_hash_lock); - goto out2; } enum cleanup_prefix_rt_t { @@ -1334,7 +1349,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft); + tmp_valid_lft, tmp_prefered_lft, true); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2018,7 +2033,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft); + preferred_lft, false); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2476,7 +2491,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft); + prefered_lft, false); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2845,7 +2860,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft); + valid_lft, prefered_lft, true); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2960,7 +2975,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, + true); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3060,7 +3076,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); -- cgit v1.2.3 From ff7883ea60e7b021bcd6539b8211879554c8db9a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:53 -0700 Subject: net: ipv6: Make inet6addr_validator a blocking notifier inet6addr_validator chain was added by commit 3ad7d2468f79f ("Ipvlan should return an error when an address is already in use") to allow address validation before changes are committed and to be able to fail the address change with an error back to the user. The address validation is not done for addresses received from router advertisements. Handling RAs in softirq context is the only reason for the notifier chain to be atomic versus blocking. Since the only current user, ipvlan, of the validator chain ignores softirq context, the notifier can be made blocking and simply not invoked for softirq path. The blocking option is needed by spectrum for example to validate resources for an adding an address to an interface. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 ++++++++++++++------- net/ipv6/addrconf_core.c | 9 +++++---- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a8d202b1b919..dd9c0c435f71 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -993,7 +993,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt = NULL; - struct in6_validator_info i6vi; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1013,12 +1012,20 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - i6vi.i6vi_addr = *addr; - i6vi.i6vi_dev = idev; - err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); - err = notifier_to_errno(err); - if (err < 0) - goto out; + /* validator notifier needs to be blocking; + * do not call in atomic context + */ + if (can_block) { + struct in6_validator_info i6vi = { + .i6vi_addr = *addr, + .i6vi_dev = idev, + }; + + err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); + err = notifier_to_errno(err); + if (err < 0) + goto out; + } ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 9e3488d50b15..32b564dfd02a 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -88,7 +88,7 @@ int __ipv6_addr_type(const struct in6_addr *addr) EXPORT_SYMBOL(__ipv6_addr_type); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); -static ATOMIC_NOTIFIER_HEAD(inet6addr_validator_chain); +static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain); int register_inet6addr_notifier(struct notifier_block *nb) { @@ -110,19 +110,20 @@ EXPORT_SYMBOL(inet6addr_notifier_call_chain); int register_inet6addr_validator_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&inet6addr_validator_chain, nb); + return blocking_notifier_chain_register(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(register_inet6addr_validator_notifier); int unregister_inet6addr_validator_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&inet6addr_validator_chain, nb); + return blocking_notifier_chain_unregister(&inet6addr_validator_chain, + nb); } EXPORT_SYMBOL(unregister_inet6addr_validator_notifier); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) { - return atomic_notifier_call_chain(&inet6addr_validator_chain, val, v); + return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v); } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); -- cgit v1.2.3 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 8 +++++--- net/ipv6/addrconf.c | 22 ++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e1e2ec0525e6..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -444,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -489,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -521,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -902,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd9c0c435f71..93f9c0a61911 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -987,7 +987,7 @@ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft, - bool can_block) + bool can_block, struct netlink_ext_ack *extack) { gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); @@ -1019,6 +1019,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct in6_validator_info i6vi = { .i6vi_addr = *addr, .i6vi_dev = idev, + .extack = extack, }; err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); @@ -1356,7 +1357,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true); + tmp_valid_lft, tmp_prefered_lft, true, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2040,7 +2041,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft, false); + preferred_lft, false, NULL); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2498,7 +2499,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft, false); + prefered_lft, false, NULL); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2808,7 +2809,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, const struct in6_addr *peer_pfx, unsigned int plen, __u32 ifa_flags, - __u32 prefered_lft, __u32 valid_lft) + __u32 prefered_lft, __u32 valid_lft, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2867,7 +2869,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft, true); + valid_lft, prefered_lft, true, extack); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2952,7 +2954,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL); rtnl_unlock(); return err; } @@ -2983,7 +2985,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, - true); + true, NULL); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3083,7 +3085,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); @@ -4586,7 +4588,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, - preferred_lft, valid_lft); + preferred_lft, valid_lft, extack); } if (nlh->nlmsg_flags & NLM_F_EXCL || -- cgit v1.2.3 From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. The listener by default uses the global key until the socket option is set. The key is a 16 bytes long binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 3 ++- net/ipv4/tcp.c | 33 +++++++++++++++++++++++++++ net/ipv4/tcp_fastopen.c | 56 +++++++++++++++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 1 + 4 files changed, 76 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index cac8dd309f39..81d218346cf7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -284,7 +284,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, NULL, user_key, + TCP_FASTOPEN_KEY_LENGTH); } bad_key: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3b34850d361f..8b1fa4dd4538 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2571,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + if (optlen != sizeof(key)) + return -EINVAL; + + if (copy_from_user(key, optval, optlen)) + return -EFAULT; + + return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + } default: /* fallthru */ break; @@ -3157,6 +3168,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctx; + + if (get_user(len, optlen)) + return -EFAULT; + + rcu_read_lock(); + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + if (ctx) + memcpy(key, ctx->key, sizeof(key)); + else + len = 0; + rcu_read_unlock(); + + len = min_t(unsigned int, len, sizeof(key)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 7ee4aadcdd71..21075ce19cb6 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -29,7 +29,7 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -40,6 +40,16 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } +void tcp_fastopen_destroy_cipher(struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference_protected( + inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); + if (ctx) + call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); +} + void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; @@ -55,10 +65,12 @@ void tcp_fastopen_ctx_destroy(struct net *net) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } -int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len) { - int err; struct tcp_fastopen_context *ctx, *octx; + struct fastopen_queue *q; + int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -79,27 +91,39 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + if (sk) { + q = &inet_csk(sk)->icsk_accept_queue.fastopenq; + spin_lock_bh(&q->lock); + octx = rcu_dereference_protected(q->ctx, + lockdep_is_held(&q->lock)); + rcu_assign_pointer(q->ctx, ctx); + spin_unlock_bh(&q->lock); + } else { + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + } if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(struct net *net, - const void *path, +static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) + ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -115,7 +139,7 @@ static bool __tcp_fastopen_cookie_gen(struct net *net, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct net *net, +static bool tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) @@ -124,7 +148,7 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(net, path, foc); + return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -132,13 +156,13 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(net, buf, foc); + return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif @@ -313,7 +337,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ecee4ddb24c5..28ca4e177047 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1893,6 +1893,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); -- cgit v1.2.3 From 6eba87c781aaa02f6bf1b64df2f8b12833eee521 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 11:39:13 -0700 Subject: net: ipv4: Change fib notifiers to take a fib_alias All of the notifier data (fib_info, tos, type and table id) are contained in the fib_alias. Pass it to the notifier instead of each data separately shortening the argument list by 3. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c636650a6a70..aaa1ba09afaa 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,32 +87,30 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } @@ -1216,9 +1214,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); + key, plen, new_fa); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1273,8 +1269,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); + call_fib_entry_notifiers(net, event, key, plen, new_fa); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1574,8 +1569,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fa_to_delete); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1892,9 +1886,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + KEYLENGTH - fa->fa_slen, fa); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1932,8 +1924,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, continue; call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id); + KEYLENGTH - fa->fa_slen, fa); } } -- cgit v1.2.3 From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that returns the map fd. The flags is used to set up the file mode when construct a new file descriptor for bpf maps. To not break the backward capability, the f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY. When the userspace want to modify or read the map content, it will check the file mode to see if it is allowed to make the change. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netfilter/xt_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 29123934887b..041da0d9c06f 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) int retval, fd; set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path); + fd = bpf_obj_get_user(path, 0); set_fs(oldfs); if (fd < 0) return fd; -- cgit v1.2.3 From b886d5f2f2906a866e33212734d204dfe35d50d9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 16:07:10 +0200 Subject: ipv6: start fib6 gc on RTF_CACHE dst creation After the commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache"), the fib6 gc is not started after the creation of a RTF_CACHE via a redirect or pmtu update, since fib6_add() isn't invoked anymore for such dsts. We need the fib6 gc to run periodically to clean the RTF_CACHE, or the dst will stay there forever. Fix it by explicitly calling fib6_force_start_gc() on successful exception creation. gc_args->more accounting will ensure that the gc timer will run for whatever time needed to properly clean the table. v2 -> v3: - clarified the commit message Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Paolo Abeni Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2e8842fa6450..efecdcff5055 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1340,8 +1340,10 @@ out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ - if (!err) + if (!err) { fib6_update_sernum(ort); + fib6_force_start_gc(net); + } return err; } -- cgit v1.2.3 From 1859bac04fb696b858dbbff59503c945ec871bd9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 16:07:11 +0200 Subject: ipv6: remove from fib tree aged out RTF_CACHE dst The commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") partially reverted the commit 1e2ea8ad37be ("ipv6: set dst.obsolete when a cached route has expired"). As a result, RTF_CACHE dst referenced outside the fib tree will not be removed until the next sernum change; dst_check() does not fail on aged-out dst, and dst->__refcnt can't decrease: the aged out dst will stay valid for a potentially unlimited time after the timeout expiration. This change explicitly removes RTF_CACHE dst from the fib tree when aged out. The rt6_remove_exception() logic will then obsolete the dst and other entities will drop the related reference on next dst_check(). pMTU exceptions are not aged-out, and are removed from the exception table only when the - usually considerably longer - ip6_rt_mtu_expires timeout expires. v1 -> v2: - do not touch dst.obsolete in rt6_remove_exception(), not needed v2 -> v3: - take care of pMTU exceptions, too Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Paolo Abeni Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index efecdcff5055..074fac966018 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1575,7 +1575,13 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, { struct rt6_info *rt = rt6_ex->rt6i; - if (atomic_read(&rt->dst.__refcnt) == 1 && + /* we are pruning and obsoleting aged-out and non gateway exceptions + * even if others have still references to them, so that on next + * dst_check() such references can be dropped. + * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when + * expired, independently from their aging, as per RFC 8201 section 4 + */ + if (!(rt->rt6i_flags & RTF_EXPIRES) && time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); @@ -1595,6 +1601,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, rt6_remove_exception(bucket, rt6_ex); return; } + } else if (__rt6_check_expired(rt)) { + RT6_TRACE("purging expired route %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; } gc_args->more++; } -- cgit v1.2.3 From ba233b34741a1dc88d1e94db7deeb7b079ef4b9a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Oct 2017 14:20:30 -0700 Subject: tcp: fix tcp_send_syn_data() syn_data was allocated by sk_stream_alloc_skb(), meaning its destructor and _skb_refdst fields are mangled. We need to call tcp_skb_tsorted_anchor_cleanup() before calling kfree_skb() or kernel crashes. Bug was reported by syzkaller bot. Fixes: e2080072ed2d ("tcp: new list for sent but unacked skbs for RACK recovery") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 53dc1267c85e..988733f289c8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3383,6 +3383,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) int copied = copy_from_iter(skb_put(syn_data, space), space, &fo->data->msg_iter); if (unlikely(!copied)) { + tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } -- cgit v1.2.3 From 164a5e7ad531e181334a3d3f03d0d5ad20d6faea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Oct 2017 17:02:03 -0700 Subject: ipv4: ipv4_default_advmss() should use route mtu ipv4_default_advmss() incorrectly uses the device MTU instead of the route provided one. IPv6 has the proper behavior, lets harmonize the two protocols. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4306db827374..bc40bd411196 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1250,7 +1250,7 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, ip_rt_min_advmss); return min(advmss, IPV4_MAX_PMTU - header_size); -- cgit v1.2.3 From b65f164d37cf6d4aac59b0e13c2e5c4cfe293fd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even if no trace is enabled. This is due to the fib6_table de-referencing currently performed by the caller. Let's the tracing code pay this overhead, passing to the trace helper the table pointer. This gives small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 074fac966018..46c59a53c53f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -945,7 +945,7 @@ restart: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; @@ -1682,7 +1682,7 @@ redo_rt6_select: if (rt == net->ipv6.ip6_null_entry) { rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { if (ip6_hold_safe(net, &rt, true)) { @@ -1690,7 +1690,7 @@ redo_rt6_select: rt6_dst_from_metrics_check(rt); } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { @@ -1726,7 +1726,7 @@ redo_rt6_select: } uncached_rt_out: - trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; } else { @@ -1754,7 +1754,7 @@ uncached_rt_out: } local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, pcpu_rt, table, fl6); return pcpu_rt; } } @@ -2195,7 +2195,7 @@ out: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; }; -- cgit v1.2.3 From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:29 +0200 Subject: net: sched: add block bind/unbind notif. and extended block_get/put Introduce new type of ndo_setup_tc message to propage binding/unbinding of a block to driver. Call this ndo whenever qdisc gets/puts a block. Alongside with this, there's need to propagate binder type from qdisc code down to the notifier. So introduce extended variants of block_get/put in order to pass this info. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2e8e87fd9d97..92dce26d10e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -240,8 +240,36 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, chain->p_filter_chain = p_filter_chain; } -int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei, + enum tc_block_command command) +{ + struct net_device *dev = q->dev_queue->dev; + struct tc_block_offload bo = {}; + + if (!tc_can_offload(dev)) + return; + bo.command = command; + bo.binder_type = ei->binder_type; + bo.block = block; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); +} + +static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND); +} + +static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); +} + +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -259,6 +287,7 @@ int tcf_block_get(struct tcf_block **p_block, tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); block->net = qdisc_net(q); block->q = q; + tcf_block_offload_bind(block, q, ei); *p_block = block; return 0; @@ -266,15 +295,28 @@ err_chain_create: kfree(block); return err; } +EXPORT_SYMBOL(tcf_block_get_ext); + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +{ + struct tcf_block_ext_info ei = {0, }; + + return tcf_block_get_ext(p_block, p_filter_chain, q, &ei); +} EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; if (!block) return; + tcf_block_offload_unbind(block, q, ei); + /* XXX: Standalone actions are not allowed to jump to any chain, and * bound actions should be all removed after flushing. However, * filters are destroyed in RCU callbacks, we have to hold the chains @@ -302,6 +344,14 @@ void tcf_block_put(struct tcf_block *block) tcf_chain_put(chain); kfree(block); } +EXPORT_SYMBOL(tcf_block_put_ext); + +void tcf_block_put(struct tcf_block *block) +{ + struct tcf_block_ext_info ei = {0, }; + + tcf_block_put_ext(block, NULL, block->q, &ei); +} EXPORT_SYMBOL(tcf_block_put); /* Main classifier routine: scans classifier chain attached -- cgit v1.2.3 From 6e40cf2d4dee9dc22ff398041ce876bef8172dea Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:30 +0200 Subject: net: sched: use extended variants of block_get/put in ingress and clsact qdiscs Use previously introduced extended variants of block get and put functions. This allows to specify a binder types specific to clsact ingress/egress which is useful for drivers to distinguish who actually got the block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_ingress.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 9ccc1b89b0d9..b599db26d34b 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -20,6 +20,7 @@ struct ingress_sched_data { struct tcf_block *block; + struct tcf_block_ext_info block_info; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -59,7 +60,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list, sch); + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + + err = tcf_block_get_ext(&q->block, &dev->ingress_cl_list, + sch, &q->block_info); if (err) return err; @@ -72,8 +76,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_block_put(q->block); + tcf_block_put_ext(q->block, &dev->ingress_cl_list, + sch, &q->block_info); net_dec_ingress_queue(); } @@ -114,6 +120,8 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { struct clsact_sched_data { struct tcf_block *ingress_block; struct tcf_block *egress_block; + struct tcf_block_ext_info ingress_block_info; + struct tcf_block_ext_info egress_block_info; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -153,13 +161,19 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list, sch); + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + + err = tcf_block_get_ext(&q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list, sch); + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + + err = tcf_block_get_ext(&q->egress_block, &dev->egress_cl_list, + sch, &q->egress_block_info); if (err) - return err; + goto err_egress_block_get; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -167,14 +181,22 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; + +err_egress_block_get: + tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); + return err; } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_block_put(q->egress_block); - tcf_block_put(q->ingress_block); + tcf_block_put_ext(q->egress_block, &dev->egress_cl_list, + sch, &q->egress_block_info); + tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); net_dec_ingress_queue(); net_dec_egress_queue(); -- cgit v1.2.3 From acb674428c3d57bccbe3f4a1a7a009f6d73e9f41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:31 +0200 Subject: net: sched: introduce per-block callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule for a specific block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 92dce26d10e3..b16c79c47be5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -278,6 +278,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, if (!block) return -ENOMEM; INIT_LIST_HEAD(&block->chain_list); + INIT_LIST_HEAD(&block->cb_list); + /* Create chain 0 by default, it has to be always present. */ chain = tcf_chain_create(block, 0); if (!chain) { @@ -354,6 +356,109 @@ void tcf_block_put(struct tcf_block *block) } EXPORT_SYMBOL(tcf_block_put); +struct tcf_block_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_ident; + void *cb_priv; + unsigned int refcnt; +}; + +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(tcf_block_cb_priv); + +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ struct tcf_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) + if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) + return block_cb; + return NULL; +} +EXPORT_SYMBOL(tcf_block_cb_lookup); + +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(tcf_block_cb_incref); + +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(tcf_block_cb_decref); + +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return NULL; + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + list_add(&block_cb->list, &block->cb_list); + return block_cb; +} +EXPORT_SYMBOL(__tcf_block_cb_register); + +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + return block_cb ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(tcf_block_cb_register); + +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ + list_del(&block_cb->list); + kfree(block_cb); +} +EXPORT_SYMBOL(__tcf_block_cb_unregister); + +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + struct tcf_block_cb *block_cb; + + block_cb = tcf_block_cb_lookup(block, cb, cb_ident); + if (!block_cb) + return; + __tcf_block_cb_unregister(block_cb); +} +EXPORT_SYMBOL(tcf_block_cb_unregister); + +static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_block_cb *block_cb; + int ok_count = 0; + int err; + + list_for_each_entry(block_cb, &block->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. -- cgit v1.2.3 From 208c0f4b5237f1d6611b2c679a8022d6901577d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:32 +0200 Subject: net: sched: use tc_setup_cb_call to call per-block callbacks Extend the tc_setup_cb_call entrypoint function originally used only for action egress devices callbacks to call per-block callbacks as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 21 ++++++++++++++++++--- net/sched/cls_flower.c | 9 ++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b16c79c47be5..cdfdc24b89cf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1206,10 +1206,25 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, return ok_count; } -int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop) +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop) { - return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); + int ok_count; + int ret; + + ret = tcf_block_cb_call(block, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count = ret; + + if (!exts) + return ok_count; + ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; + + return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5b7bb968d1d4..76b4e0a1c92f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -201,6 +201,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; @@ -209,7 +210,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) if (tc_can_offload(dev)) dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); - tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } @@ -220,6 +221,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); int err; @@ -242,7 +244,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, } } - err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { fl_hw_destroy_filter(tp, f); @@ -261,6 +263,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; @@ -270,7 +273,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) if (tc_can_offload(dev)) dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); - tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } -- cgit v1.2.3 From 2447a96f88ee3c082603c2dd38ae51f66977c11d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:33 +0200 Subject: net: sched: cls_matchall: call block callbacks for offload Use the newly introduced callbacks infrastructure and call block callbacks alongside with the existing per-netdev ndo_setup_tc. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 72 ++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index eeac606c95ab..5278534c7e87 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -50,50 +50,73 @@ static void mall_destroy_rcu(struct rcu_head *rcu) kfree(head); } -static int mall_replace_hw_filter(struct tcf_proto *tp, - struct cls_mall_head *head, - unsigned long cookie) +static void mall_destroy_hw_filter(struct tcf_proto *tp, + struct cls_mall_head *head, + unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_matchall_offload cls_mall = {}; - int err; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_mall.common, tp); - cls_mall.command = TC_CLSMATCHALL_REPLACE; - cls_mall.exts = &head->exts; + cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - &cls_mall); - if (!err) - head->flags |= TCA_CLS_FLAGS_IN_HW; - - return err; + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, + &cls_mall); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); } -static void mall_destroy_hw_filter(struct tcf_proto *tp, - struct cls_mall_head *head, - unsigned long cookie) +static int mall_replace_hw_filter(struct tcf_proto *tp, + struct cls_mall_head *head, + unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + bool skip_sw = tc_skip_sw(head->flags); + int err; tc_cls_common_offload_init(&cls_mall.common, tp); - cls_mall.command = TC_CLSMATCHALL_DESTROY; + cls_mall.command = TC_CLSMATCHALL_REPLACE; + cls_mall.exts = &head->exts; cls_mall.cookie = cookie; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &cls_mall); + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, + &cls_mall); + if (err) { + if (skip_sw) + return err; + } else { + head->flags |= TCA_CLS_FLAGS_IN_HW; + } + } + + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, + &cls_mall, skip_sw); + if (err < 0) { + mall_destroy_hw_filter(tp, head, cookie); + return err; + } else if (err > 0) { + head->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + + return 0; } static void mall_destroy(struct tcf_proto *tp) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct net_device *dev = tp->q->dev_queue->dev; if (!head) return; - if (tc_should_offload(dev, head->flags)) + if (!tc_skip_hw(head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head); call_rcu(&head->rcu, mall_destroy_rcu); @@ -133,7 +156,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, void **arg, bool ovr) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct net_device *dev = tp->q->dev_queue->dev; struct nlattr *tb[TCA_MATCHALL_MAX + 1]; struct cls_mall_head *new; u32 flags = 0; @@ -173,14 +195,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (err) goto err_set_parms; - if (tc_should_offload(dev, flags)) { + if (!tc_skip_hw(new->flags)) { err = mall_replace_hw_filter(tp, new, (unsigned long) new); - if (err) { - if (tc_skip_sw(flags)) - goto err_replace_hw_filter; - else - err = 0; - } + if (err) + goto err_replace_hw_filter; } if (!tc_in_hw(new->flags)) -- cgit v1.2.3 From 77460411929d8904823419fca73bdad666a659d3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:34 +0200 Subject: net: sched: cls_u32: swap u32_remove_hw_knode and u32_remove_hw_hnode Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 49d96b45a8ce..c3ea684b8e91 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -462,7 +462,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload cls_u32 = {}; @@ -471,8 +471,10 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) return; tc_cls_common_offload_init(&cls_u32.common, tp); - cls_u32.command = TC_CLSU32_DELETE_KNODE; - cls_u32.knode.handle = handle; + cls_u32.command = TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = h->divisor; + cls_u32.hnode.handle = h->handle; + cls_u32.hnode.prio = h->prio; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); } @@ -500,7 +502,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, return 0; } -static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload cls_u32 = {}; @@ -509,10 +511,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) return; tc_cls_common_offload_init(&cls_u32.common, tp); - cls_u32.command = TC_CLSU32_DELETE_HNODE; - cls_u32.hnode.divisor = h->divisor; - cls_u32.hnode.handle = h->handle; - cls_u32.hnode.prio = h->prio; + cls_u32.command = TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = handle; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); } -- cgit v1.2.3 From 245dc5121a9bf6a0a12ac1e72f47822fc3fa8cae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:35 +0200 Subject: net: sched: cls_u32: call block callbacks for offload Use the newly introduced callbacks infrastructure and call block callbacks alongside with the existing per-netdev ndo_setup_tc. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 72 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c3ea684b8e91..d53da7968eda 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -465,39 +465,57 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) { struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, 0)) - return; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; + bool skip_sw = tc_skip_sw(flags); + bool offloaded = false; int err; - if (!tc_should_offload(dev, flags)) - return tc_skip_sw(flags) ? -EINVAL : 0; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_NEW_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); - if (tc_skip_sw(flags)) + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, + &cls_u32); + if (err) { + if (skip_sw) + return err; + } else { + offloaded = true; + } + } + + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + if (err < 0) { + u32_clear_hw_hnode(tp, h); return err; + } else if (err > 0) { + offloaded = true; + } + + if (skip_sw && !offloaded) + return -EINVAL; return 0; } @@ -505,28 +523,27 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) { struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, 0)) - return; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = handle; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; + bool skip_sw = tc_skip_sw(flags); int err; - if (!tc_should_offload(dev, flags)) - return tc_skip_sw(flags) ? -EINVAL : 0; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_REPLACE_KNODE; cls_u32.knode.handle = n->handle; @@ -543,13 +560,28 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) cls_u32.knode.link_handle = n->ht_down->handle; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); - if (!err) - n->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, + &cls_u32); + if (err) { + if (skip_sw) + return err; + } else { + n->flags |= TCA_CLS_FLAGS_IN_HW; + } + } - if (tc_skip_sw(flags)) + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + if (err < 0) { + u32_remove_hw_knode(tp, n->handle); return err; + } else if (err > 0) { + n->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(n->flags && TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; return 0; } -- cgit v1.2.3 From 3f7889c4c79b0afc012026431d4db4bad4e84473 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:36 +0200 Subject: net: sched: cls_bpf: call block callbacks for offload Use the newly introduced callbacks infrastructure and call block callbacks alongside with the existing per-netdev ndo_setup_tc. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6c6b21f6ba62..e379fdf928bd 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -147,7 +147,10 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd) { + bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; + bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; int err; @@ -159,17 +162,38 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf.exts_integrated = prog->exts_integrated; cls_bpf.gen_flags = prog->gen_flags; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &cls_bpf); - if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE)) - prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, + &cls_bpf); + if (addorrep) { + if (err) { + if (skip_sw) + return err; + } else { + prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + } + } + } + + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); + if (addorrep) { + if (err < 0) { + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + return err; + } else if (err > 0) { + prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + } + } - return err; + if (addorrep && skip_sw && !(prog->gen_flags && TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + + return 0; } static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - struct net_device *dev = tp->q->dev_queue->dev; struct cls_bpf_prog *obj = prog; enum tc_clsbpf_command cmd; bool skip_sw; @@ -179,7 +203,7 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, (oldprog && tc_skip_sw(oldprog->gen_flags)); if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, prog->gen_flags)) { + if (!tc_skip_hw(prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else if (!tc_skip_sw(prog->gen_flags)) { obj = oldprog; @@ -188,14 +212,14 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, return -EINVAL; } } else { - if (!tc_should_offload(dev, prog->gen_flags)) + if (tc_skip_hw(prog->gen_flags)) return skip_sw ? -EINVAL : 0; cmd = TC_CLSBPF_ADD; } ret = cls_bpf_offload_cmd(tp, obj, cmd); if (ret) - return skip_sw ? ret : 0; + return ret; obj->offloaded = true; if (oldprog) -- cgit v1.2.3 From 6b3eb752b4b9481868b3393f06a236a1aedfa43f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:45 +0200 Subject: dsa: Convert ndo_setup_tc offloads to block callbacks Benefit from the newly introduced block callback infrastructure and convert ndo_setup_tc calls for matchall offloads to block callbacks. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/dsa/slave.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6906de0f0050..80142918d5d1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -777,17 +777,9 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, } static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, - struct tc_cls_matchall_offload *cls) + struct tc_cls_matchall_offload *cls, + bool ingress) { - bool ingress; - - if (is_classid_clsact_ingress(cls->common.classid)) - ingress = true; - else if (is_classid_clsact_egress(cls->common.classid)) - ingress = false; - else - return -EOPNOTSUPP; - if (cls->common.chain_index) return -EOPNOTSUPP; @@ -802,12 +794,62 @@ static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, } } +static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv, bool ingress) +{ + struct net_device *dev = cb_priv; + + switch (type) { + case TC_SETUP_CLSMATCHALL: + return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress); + default: + return -EOPNOTSUPP; + } +} + +static int dsa_slave_setup_tc_block_cb_ig(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, true); +} + +static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false); +} + +static int dsa_slave_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + tc_setup_cb_t *cb; + + if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + cb = dsa_slave_setup_tc_block_cb_ig; + else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + cb = dsa_slave_setup_tc_block_cb_eg; + else + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, cb, dev, dev); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, cb, dev); + return 0; + default: + return -EOPNOTSUPP; + } +} + static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { switch (type) { case TC_SETUP_CLSMATCHALL: - return dsa_slave_setup_tc_cls_matchall(dev, type_data); + return 0; /* will be removed after conversion from ndo */ + case TC_SETUP_BLOCK: + return dsa_slave_setup_tc_block(dev, type_data); default: return -EOPNOTSUPP; } -- cgit v1.2.3 From 8d26d5636dff9fca30816579910aaa9a55b4d96d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:46 +0200 Subject: net: sched: avoid ndo_setup_tc calls for TC_SETUP_CLS* All drivers are converted to use block callbacks for TC_SETUP_CLS*. So it is now safe to remove the calls to ndo_setup_tc from cls_* Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 -- net/sched/cls_bpf.c | 14 -------------- net/sched/cls_flower.c | 20 -------------------- net/sched/cls_matchall.c | 16 ---------------- net/sched/cls_u32.c | 31 ------------------------------- 5 files changed, 83 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 80142918d5d1..d0ae7010ea45 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -846,8 +846,6 @@ static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { switch (type) { - case TC_SETUP_CLSMATCHALL: - return 0; /* will be removed after conversion from ndo */ case TC_SETUP_BLOCK: return dsa_slave_setup_tc_block(dev, type_data); default: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index e379fdf928bd..0f8b51061c39 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -148,7 +148,6 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd) { bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; @@ -162,19 +161,6 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf.exts_integrated = prog->exts_integrated; cls_bpf.gen_flags = prog->gen_flags; - if (tc_can_offload(dev)) { - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, - &cls_bpf); - if (addorrep) { - if (err) { - if (skip_sw) - return err; - } else { - prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; - } - } - } - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); if (addorrep) { if (err < 0) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 76b4e0a1c92f..16f58abaa697 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -200,16 +200,12 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - if (tc_can_offload(dev)) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } @@ -219,7 +215,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct fl_flow_key *mask, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); @@ -233,17 +228,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; - if (tc_can_offload(dev)) { - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (err) { - if (skip_sw) - return err; - } else { - f->flags |= TCA_CLS_FLAGS_IN_HW; - } - } - err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { @@ -262,7 +246,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); @@ -270,9 +253,6 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - if (tc_can_offload(dev)) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 5278534c7e87..70e78d74f6d3 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -54,7 +54,6 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; @@ -62,9 +61,6 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; - if (tc_can_offload(dev)) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - &cls_mall); tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); } @@ -72,7 +68,6 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(head->flags); @@ -83,17 +78,6 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.exts = &head->exts; cls_mall.cookie = cookie; - if (tc_can_offload(dev)) { - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - &cls_mall); - if (err) { - if (skip_sw) - return err; - } else { - head->flags |= TCA_CLS_FLAGS_IN_HW; - } - } - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); if (err < 0) { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d53da7968eda..9ff17159fb61 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -464,7 +464,6 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) { - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; @@ -474,15 +473,12 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - if (tc_can_offload(dev)) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags) { - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); @@ -495,17 +491,6 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - if (tc_can_offload(dev)) { - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, - &cls_u32); - if (err) { - if (skip_sw) - return err; - } else { - offloaded = true; - } - } - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_clear_hw_hnode(tp, h); @@ -522,7 +507,6 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) { - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; @@ -530,15 +514,12 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = handle; - if (tc_can_offload(dev)) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { - struct net_device *dev = tp->q->dev_queue->dev; struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); @@ -560,18 +541,6 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) cls_u32.knode.link_handle = n->ht_down->handle; - - if (tc_can_offload(dev)) { - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, - &cls_u32); - if (err) { - if (skip_sw) - return err; - } else { - n->flags |= TCA_CLS_FLAGS_IN_HW; - } - } - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_remove_hw_knode(tp, n->handle); -- cgit v1.2.3 From cb4dc41eaad0cb336bb5ddd379ae0d2cc89cb62b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 19 Oct 2017 16:42:04 +0200 Subject: tipc: fix broken tipc_poll() function In commit ae236fb208a6 ("tipc: receive group membership events via member socket") we broke the tipc_poll() function by checking the state of the receive queue before the call to poll_sock_wait(), while relying that state afterwards, when it might have changed. We restore this in this commit. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 2bbab4fe2f53..357954ceb25c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -714,7 +714,6 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; u32 revents = 0; @@ -733,7 +732,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (skb) + if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: @@ -742,7 +741,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, revents |= POLLOUT; if (!tipc_sk_type_connectionless(sk)) break; - if (!skb) + if (skb_queue_empty(&sk->sk_receive_queue)) break; revents |= POLLIN | POLLRDNORM; break; -- cgit v1.2.3 From e28101a37c3e6b74aeebe4a340636b1782deb830 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Oct 2017 12:43:08 -0500 Subject: net: netrom: nr_in: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/netrom/nr_in.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 80dbd0beb516..fbfdae452ff9 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -125,7 +125,7 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, case NR_DISCREQ: nr_write_internal(sk, NR_DISCACK); - + /* fall through */ case NR_DISCACK: nr_disconnect(sk, 0); break; -- cgit v1.2.3 From 279badc2a85be83e0187b8c566e3b476b76a87a2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Oct 2017 12:55:03 -0500 Subject: openvswitch: conntrack: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case I placed a "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index fe861e2f0deb..b27c5c6d9cab 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -752,6 +752,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } } /* Non-ICMP, fall thru to initialize if needed. */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. -- cgit v1.2.3 From a05b8c43ac3cc156600a60bf6dcd30109bd42a73 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Oct 2017 13:03:51 -0500 Subject: net: rose: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/rose/rose_in.c | 1 + net/rose/rose_route.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 0a6394754e81..9bbbfe325c5a 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -219,6 +219,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety switch (frametype) { case ROSE_RESET_REQUEST: rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + /* fall through */ case ROSE_RESET_CONFIRMATION: rose_stop_timer(sk); rose_start_idletimer(sk); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 65921cd10323..8ca3124df83f 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -346,6 +346,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, case 0: rose_node->neighbour[0] = rose_node->neighbour[1]; + /* fall through */ case 1: rose_node->neighbour[1] = rose_node->neighbour[2]; @@ -507,6 +508,7 @@ void rose_rt_device_down(struct net_device *dev) switch (i) { case 0: t->neighbour[0] = t->neighbour[1]; + /* fall through */ case 1: t->neighbour[1] = t->neighbour[2]; case 2: -- cgit v1.2.3 From f3ae608edb3be2e9a3f668d47aced3553eaf6c14 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Oct 2017 16:28:24 -0500 Subject: net: sched: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 1 + net/sched/sch_drr.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hfsc.c | 1 + net/sched/sch_htb.c | 1 + net/sched/sch_multiq.c | 1 + net/sched/sch_prio.c | 1 + net/sched/sch_qfq.c | 1 + net/sched/sch_sfb.c | 1 + net/sched/sch_sfq.c | 1 + 10 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c3b92d62190e..6361be7881f1 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -255,6 +255,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; case TC_ACT_RECLASSIFY: diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 753dc7a77b60..5bbcef3dcd8c 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -321,6 +321,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 3c40edeff1e8..0305d791ea94 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -105,6 +105,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index a692184bd333..d04068a97d81 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1144,6 +1144,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 57be73c0e1d2..fa0380730ff0 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -244,6 +244,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 31e0a284eeff..012216386c0b 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -54,6 +54,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 95fad348c8d7..2c79559a0d31 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -50,6 +50,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8694c7b6d2b1..6962b37a3ad3 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -709,6 +709,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 487d375f5a06..0678debdd856 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -268,6 +268,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return false; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 5a9c95149c5c..890f4a4564e7 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -190,6 +190,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return 0; } -- cgit v1.2.3 From 0d5fcebf3c370eb27f9a0e8db454625e73cd1cb4 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 20 Oct 2017 11:21:32 +0200 Subject: tipc: refactor tipc_sk_timeout() function The function tipc_sk_timeout() is more complex than necessary, and even seems to contain an undetected bug. At one of the occurences where we renew the timer we just order it with (HZ / 20), instead of (jiffies + HZ / 20); In this commit we clean up the function. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 357954ceb25c..b3b72d8e9543 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -48,7 +48,7 @@ #include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ -#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ +#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 @@ -1472,7 +1472,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, msg_set_lookup_scope(msg, 0); msg_set_hdr_sz(msg, SHORT_H_SIZE); - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL); + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); @@ -2533,43 +2533,40 @@ static int tipc_shutdown(struct socket *sock, int how) static void tipc_sk_timeout(unsigned long data) { struct tipc_sock *tsk = (struct tipc_sock *)data; + u32 peer_port = tsk_peer_port(tsk); + u32 peer_node = tsk_peer_node(tsk); + u32 own_node = tsk_own_node(tsk); + u32 own_port = tsk->portid; struct sock *sk = &tsk->sk; + struct net *net = sock_net(sk); struct sk_buff *skb = NULL; - u32 peer_port, peer_node; - u32 own_node = tsk_own_node(tsk); bh_lock_sock(sk); - if (!tipc_sk_connected(sk)) { - bh_unlock_sock(sk); + if (!tipc_sk_connected(sk)) + goto exit; + + /* Try again later if socket is busy */ + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); goto exit; } - peer_port = tsk_peer_port(tsk); - peer_node = tsk_peer_node(tsk); if (tsk->probe_unacked) { - if (!sock_owned_by_user(sk)) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), - tsk_peer_port(tsk)); - sk->sk_state_change(sk); - } else { - /* Try again later */ - sk_reset_timer(sk, &sk->sk_timer, (HZ / 20)); - } - - bh_unlock_sock(sk); + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, peer_node, peer_port); + sk->sk_state_change(sk); goto exit; } - - skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, - INT_H_SIZE, 0, peer_node, own_node, - peer_port, tsk->portid, TIPC_OK); + /* Send new probe */ + skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, + peer_node, own_node, peer_port, own_port, + TIPC_OK); tsk->probe_unacked = true; - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL); + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); +exit: bh_unlock_sock(sk); if (skb) - tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); -exit: + tipc_node_xmit_skb(net, skb, peer_node, own_port); sock_put(sk); } -- cgit v1.2.3 From 110af3acb8cfd79dcb5676a01e07cb2b6afa4c04 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Oct 2017 12:05:30 -0500 Subject: net: af_unix: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/unix/af_unix.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7f46bab4ce5c..a9ee634f3c42 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -814,6 +814,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, */ case SOCK_RAW: sock->type = SOCK_DGRAM; + /* fall through */ case SOCK_DGRAM: sock->ops = &unix_dgram_ops; break; -- cgit v1.2.3 From 0cea8e28df721f929ee3e1fa32489c012100f5a1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Oct 2017 12:37:52 -0500 Subject: net: x25: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/x25/x25_facilities.c | 2 +- net/x25/x25_in.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 997ff7b2509b..ad1734d36ed7 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -103,7 +103,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, *vc_fac_mask |= X25_MASK_REVERSE; break; } - + /*fall through */ case X25_FAC_THROUGHPUT: facilities->throughput = p[1]; *vc_fac_mask |= X25_MASK_THROUGHPUT; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 7ac50098a375..3c12cae32001 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -345,6 +345,7 @@ static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametyp case X25_RESET_REQUEST: x25_write_internal(sk, X25_RESET_CONFIRMATION); + /* fall through */ case X25_RESET_CONFIRMATION: { x25_stop_timer(sk); x25->condition = 0x00; -- cgit v1.2.3 From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: bpf: Adding helper function bpf_getsockops Adding support for helper function bpf_getsockops to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 09e011f20291..ccf62f44140a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3273,7 +3273,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, static const struct bpf_func_proto bpf_setsockopt_proto = { .func = bpf_setsockopt, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, @@ -3282,6 +3282,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + + if (!sk_fullsock(sk)) + goto err_clear; + +#ifdef CONFIG_INET + if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + if (optname == TCP_CONGESTION) { + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ca_ops || optlen <= 1) + goto err_clear; + strncpy(optval, icsk->icsk_ca_ops->name, optlen); + optval[optlen - 1] = 0; + } else { + goto err_clear; + } + } else { + goto err_clear; + } + return ret; +#endif +err_clear: + memset(optval, 0, optlen); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_getsockopt_proto = { + .func = bpf_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3460,6 +3502,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_getsockopt_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: -- cgit v1.2.3 From 85cce215781685bbc94b6af2bd1aa40b71965a70 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:41 -0700 Subject: bpf: Add BPF_SOCKET_OPS_BASE_RTT support to tcp_nv TCP_NV will try to get the base RTT from a socket_ops BPF program if one is loaded. NV will then use the base RTT to bound its min RTT (its notion of the base RTT). It uses the base RTT as an upper bound and 80% of the base RTT as its lower bound. In other words, NV will consider filtered RTTs larger than base RTT as a sign of congestion. As a result, there is no minRTT inflation when there is a lot of congestion. For example, in a DC where the RTTs are less than 40us when there is no congestion, a base RTT value of 80us improves the performance of NV. The difference between the uncongested RTT and the base RTT provided represents how much queueing we are willing to have (in practice it can be higher). NV has been tunned to reduce congestion when there are many flows at the cost of one flow not achieving full bandwith utilization. When a reasonable base RTT is provided, one NV flow can now fully utilize the full bandwidth. In addition, the performance is also improved when there are many flows. In the following examples the NV results are using a kernel with this patch set (i.e. both NV results are using the new nv_loss_dec_factor). With one host sending to another host and only one flow the goodputs are: Cubic: 9.3 Gbps, NV: 5.5 Gbps, NV (baseRTT=80us): 9.2 Gbps With 2 hosts sending to one host (1 flow per host, the goodput per flow is: Cubic: 4.6 Gbps, NV: 4.5 Gbps, NV (baseRTT=80us)L 4.6 Gbps But the RTTs seen by a ping process in the sender is: Cubic: 3.3ms NV: 97us, NV (baseRTT=80us): 146us With a lot of flows things look even better for NV with baseRTT. Here we have 3 hosts sending to one host. Each sending host has 6 flows: 1 stream, 4x1MB RPC, 1x10KB RPC. Cubic, NV and NV with baseRTT all fully utilize the full available bandwidth. However, the distribution of bandwidth among the flows is very different. For the 10KB RPC flow: Cubic: 27Mbps, NV: 111Mbps, NV (baseRTT=80us): 222Mbps The 99% latencies for the 10KB flows are: Cubic: 26ms, NV: 1ms, NV (baseRTT=80us): 500us The RTT seen by a ping process at the senders: Cubic: 3.2ms NV: 720us, NV (baseRTT=80us): 330us Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_nv.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 1ff73982e28c..a978e3fa71ec 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program. + * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. + */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -265,6 +298,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt; -- cgit v1.2.3 From 058c8d59124193f46db4efa5abcd9d6d0f04c88e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Oct 2017 19:43:11 -0500 Subject: net: core: rtnetlink: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG in do_setlink. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04680a53c8dd..df8dba998c48 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2274,8 +2274,7 @@ static int do_setlink(const struct sk_buff *skb, rcu_read_lock(); - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) - BUG(); + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af); if (err < 0) { -- cgit v1.2.3 From d3cc547d9ccbe223b06e87256fdd571eb63762b7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 19 Oct 2017 15:09:47 +0200 Subject: esp6: remove redundant initialization of esph The pointer esph is being initialized with a value that is never read and then being updated. Remove the redundant initialization and move the declaration and initializtion of esph to the local code block. Cleans up clang warning: net/ipv6/esp6.c:562:21: warning: Value stored to 'esph' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 89910e2c10f4..1696401fed6c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -559,14 +559,14 @@ static void esp_input_restore_header(struct sk_buff *skb) static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) { struct xfrm_state *x = xfrm_input_state(skb); - struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * decryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { - esph = skb_push(skb, 4); + struct ip_esp_hdr *esph = skb_push(skb, 4); + *seqhi = esph->spi; esph->spi = esph->seq_no; esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; -- cgit v1.2.3 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/core/net-traces.c | 2 ++ net/ipv4/tcp_ipv4.c | 6 +++++- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 10 ++++++++-- 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f4e4fa2db505..8dcd9b0be04a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -49,3 +49,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e22439f05e46..eb3f3b8e1e4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include #include +#include + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 988733f289c8..1f01f4c9c738 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3084,6 +3084,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ae83615b7f6d..0e2529958b52 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -69,6 +69,8 @@ #include #include +#include + static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -890,7 +892,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - int oif; + int oif = 0; if (th->rst) return; @@ -939,7 +941,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - oif = sk ? sk->sk_bound_dev_if : 0; + if (sk) { + oif = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 5941521c05d69cf3f2b1293eefd21207e083b70f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:25 -0700 Subject: tcp: add tracepoint trace_tcp_receive_reset New tracepoint trace_tcp_receive_reset is added and called from tcp_reset(). This tracepoint is define with a new class tcp_event_sk. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab3f12898245..c5e64d4b5839 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -75,6 +75,7 @@ #include #include #include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -4010,6 +4011,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ void tcp_reset(struct sock *sk) { + trace_tcp_receive_reset(sk); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: -- cgit v1.2.3 From e1a4aa50f47303ebb3ca0cfd01687884551ce03d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:26 -0700 Subject: tcp: add tracepoint trace_tcp_destroy_sock This patch adds trace event trace_tcp_destroy_sock. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eb3f3b8e1e4b..23a8100af5ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); -- cgit v1.2.3 From e8fce23946b7e7eadf25ad78d8207c22903dfe27 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:27 -0700 Subject: tcp: add tracepoint trace_tcp_set_state() This patch adds tracepoint trace_tcp_set_state. Besides usual fields (s/d ports, IP addresses), old and new state of the socket is also printed with TP_printk, with __print_symbolic(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8b1fa4dd4538..be07e9b6dbdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,8 @@ #include #include +#include + int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -2040,6 +2042,8 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + trace_tcp_set_state(sk, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) -- cgit v1.2.3 From 9523feac272ccad2ad8186ba4fcc89103754de52 Mon Sep 17 00:00:00 2001 From: Tuomas Tynkkynen Date: Wed, 6 Sep 2017 17:59:08 +0300 Subject: net/9p: Switch to wait_event_killable() Because userspace gets Very Unhappy when calls like stat() and execve() return -EINTR on 9p filesystem mounts. For instance, when bash is looking in PATH for things to execute and some SIGCHLD interrupts stat(), bash can throw a spurious 'command not found' since it doesn't retry the stat(). In practice, hitting the problem is rare and needs a really slow/bogged down 9p server. Cc: stable@vger.kernel.org Signed-off-by: Tuomas Tynkkynen Signed-off-by: Al Viro --- net/9p/client.c | 3 +-- net/9p/trans_virtio.c | 13 ++++++------- net/9p/trans_xen.c | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 4674235b0d9b..1beb131dd3e1 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -773,8 +773,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) } again: /* Wait for the response */ - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Make sure our req is coherent with regard to updates in other diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index f24b25c25106..f3a4efcf1456 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -286,8 +286,8 @@ req_retry: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; @@ -327,7 +327,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_interruptible(vp_wq, + err = wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; @@ -471,8 +471,8 @@ req_retry_pinned: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; @@ -489,8 +489,7 @@ req_retry_pinned: virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Non kernel buffers are pinned, unpin them */ diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 6ad3e043c617..325c56043007 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -156,8 +156,8 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) ring = &priv->rings[num]; again: - while (wait_event_interruptible(ring->wq, - p9_xen_write_todo(ring, size)) != 0) + while (wait_event_killable(ring->wq, + p9_xen_write_todo(ring, size)) != 0) ; spin_lock_irqsave(&ring->lock, flags); -- cgit v1.2.3 From 56fc709b7a9fe191173dc772a881e180458db517 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:45 -0700 Subject: ipv6: addrconf: move ipv6_chk_same_addr() to avoid forward declaration ipv6_chk_same_addr() is only used by ipv6_add_addr_hash(), so moving it avoids a forward declaration. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 93f9c0a61911..9228030e3497 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -192,8 +192,6 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); -static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev); static struct ipv6_devconf ipv6_devconf __read_mostly = { .forwarding = 0, @@ -957,6 +955,23 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) +{ + unsigned int hash = inet6_addr_hash(addr); + struct inet6_ifaddr *ifp; + + hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (!dev || ifp->idev->dev == dev) + return true; + } + } + return false; +} + static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { unsigned int hash; @@ -1856,22 +1871,6 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, } EXPORT_SYMBOL(ipv6_chk_addr_and_flags); -static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) -{ - unsigned int hash = inet6_addr_hash(addr); - struct inet6_ifaddr *ifp; - - hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; - if (ipv6_addr_equal(&ifp->addr, addr)) { - if (!dev || ifp->idev->dev == dev) - return true; - } - } - return false; -} /* Compares an address/prefix_len with addresses on device @dev. * If one is found it returns true. -- cgit v1.2.3 From 752a92927e97e88096394dac3f10d12a58555254 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:46 -0700 Subject: ipv6: addrconf: factorize inet6_addr_hash() call ipv6_add_addr_hash() can compute the hash value outside of locked section and pass it to ipv6_chk_same_addr(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9228030e3497..c1a5028f394c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -956,9 +956,8 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) } static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) + struct net_device *dev, unsigned int hash) { - unsigned int hash = inet6_addr_hash(addr); struct inet6_ifaddr *ifp; hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { @@ -974,23 +973,19 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash; + unsigned int hash = inet6_addr_hash(&ifa->addr); int err = 0; spin_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev)) { + if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) { ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; - goto out; + } else { + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); } - /* Add to big hash table */ - hash = inet6_addr_hash(&ifa->addr); - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); - -out: spin_unlock(&addrconf_hash_lock); return err; -- cgit v1.2.3 From 3f27fb23219e75343b094366f2358bff34300493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:47 -0700 Subject: ipv6: addrconf: add per netns perturbation in inet6_addr_hash() Bring IPv6 in par with IPv4 : - Use net_hash_mix() to spread addresses a bit more. - Use 256 slots hash table instead of 16 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1a5028f394c..d70d98122053 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -950,9 +950,11 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_add_tail_rcu(&ifp->if_list, p); } -static u32 inet6_addr_hash(const struct in6_addr *addr) +static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, @@ -973,7 +975,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash = inet6_addr_hash(&ifa->addr); + unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr); int err = 0; spin_lock(&addrconf_hash_lock); @@ -1838,8 +1840,8 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict, u32 banned_flags) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp; - unsigned int hash = inet6_addr_hash(addr); u32 ifp_flags; rcu_read_lock_bh(); @@ -1917,8 +1919,8 @@ EXPORT_SYMBOL(ipv6_chk_prefix); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp, *result = NULL; - unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { @@ -4242,9 +4244,9 @@ void if6_proc_exit(void) /* Check if address is a home address configured on any interface. */ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { - int ret = 0; + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp = NULL; - unsigned int hash = inet6_addr_hash(addr); + int ret = 0; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { -- cgit v1.2.3 From 480318a0a4d8b0d03ddedba1bd82e78050ede661 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:48 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_chk_addr_and_flags() rcu_read_lock() is enough here, as inet6_ifa_finish_destroy() uses kfree_rcu() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d70d98122053..a6cf37b7e34c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1844,7 +1844,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, struct inet6_ifaddr *ifp; u32 ifp_flags; - rcu_read_lock_bh(); + rcu_read_lock(); hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; @@ -1858,12 +1858,12 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, !(ifp_flags&banned_flags) && (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { - rcu_read_unlock_bh(); + rcu_read_unlock(); return 1; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } EXPORT_SYMBOL(ipv6_chk_addr_and_flags); -- cgit v1.2.3 From 24f226da96278bf3956dac6fa293ef95bc22a90c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:49 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_get_ifaddr() rcu_read_lock() is enough here, no need to block BH. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a6cf37b7e34c..6c1e7ffb62ff 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1922,8 +1922,8 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp, *result = NULL; - rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { @@ -1935,7 +1935,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add } } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return result; } -- cgit v1.2.3 From a5c1d98f8ccf4359575772b36ed51f5857dd7165 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:50 -0700 Subject: ipv6: addrconf: do not block BH in /proc/net/if_inet6 handling Table is really RCU protected, no need to block BH Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c1e7ffb62ff..9232e9537082 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4097,9 +4097,9 @@ struct if6_iter_state { static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) { - struct inet6_ifaddr *ifa = NULL; struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + struct inet6_ifaddr *ifa = NULL; int p = 0; /* initial bucket if pos is 0 */ @@ -4109,7 +4109,7 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) } for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], + hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -4135,7 +4135,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) { + hlist_for_each_entry_continue_rcu(ifa, addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; state->offset++; @@ -4144,7 +4144,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, while (++state->bucket < IN6_ADDR_HSIZE) { state->offset = 0; - hlist_for_each_entry_rcu_bh(ifa, + hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -4157,9 +4157,9 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, } static void *if6_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(rcu_bh) + __acquires(rcu) { - rcu_read_lock_bh(); + rcu_read_lock(); return if6_get_first(seq, *pos); } @@ -4173,9 +4173,9 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void if6_seq_stop(struct seq_file *seq, void *v) - __releases(rcu_bh) + __releases(rcu) { - rcu_read_unlock_bh(); + rcu_read_unlock(); } static int if6_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From 4e5f47ab97ce68d9f766dfedc5762940a90e4c11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:51 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_chk_home_addr() rcu_read_lock() is enough here, no need to block BH. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9232e9537082..5a8a10229a07 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4248,8 +4248,8 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) struct inet6_ifaddr *ifp = NULL; int ret = 0; - rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && @@ -4258,7 +4258,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) break; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret; } #endif -- cgit v1.2.3 From e3cf39706b89001fd7f3eba0aa70aa40379eef30 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Oct 2017 16:54:48 -0500 Subject: net: rxrpc: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 2 ++ net/rxrpc/input.c | 1 + net/rxrpc/sendmsg.c | 1 + 3 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 054e32872808..344b2dcad52d 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -246,6 +246,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) ret = 0; break; } + /* Fall through */ default: ret = -EBUSY; break; @@ -560,6 +561,7 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) m->msg_name = &rx->connect_srx; m->msg_namelen = sizeof(rx->connect_srx); } + /* Fall through */ case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: ret = rxrpc_do_sendmsg(rx, m, len); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e56e23ed2229..1e37eb1c0c66 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1125,6 +1125,7 @@ void rxrpc_data_ready(struct sock *udp_sk) case RXRPC_PACKET_TYPE_BUSY: if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) goto discard; + /* Fall through */ case RXRPC_PACKET_TYPE_DATA: if (sp->hdr.callNumber == 0) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2d9edc656ca3..7d2595582c09 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -220,6 +220,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, ktime_get_real()); if (!last) break; + /* Fall through */ case RXRPC_CALL_SERVER_SEND_REPLY: call->state = RXRPC_CALL_SERVER_AWAIT_ACK; rxrpc_notify_end_tx(rx, call, notify_end_tx); -- cgit v1.2.3 From 7f6b437e9b82a6d702a7f8f75c83ffdec6e03c54 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 21 Oct 2017 20:35:30 -0500 Subject: net: smc_close: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/smc/smc_close.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index f0d16fb825f7..a6c65595f248 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -360,7 +360,8 @@ static void smc_close_passive_work(struct work_struct *work) case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) sk->sk_state = SMC_PEERCLOSEWAIT2; - /* fall through to check for closing */ + /* fall through */ + /* to check for closing */ case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: if (!smc_cdc_rxed_any_close(&smc->conn)) -- cgit v1.2.3 From 152854025528b30c5ca5113a443ead98c3f1e7a5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 23 Oct 2017 13:08:14 -0500 Subject: ipv4: icmp: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG in icmp_timestamp. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3c1570d3e22f..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -968,8 +968,9 @@ static bool icmp_timestamp(struct sk_buff *skb) */ icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; - if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) - BUG(); + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; -- cgit v1.2.3 From 49ca1943a7adb429b11b8e05d81bc821694b76c7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 23 Oct 2017 13:10:56 -0500 Subject: ipv4: tcp_minisocks: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG in tcp_time_wait. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2341b9f857b6..a952357054f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -298,8 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) - BUG(); + BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } while (0); #endif -- cgit v1.2.3 From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow to enable TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicous host (aka., data-centers) or when the application-protocol already provides an authentication mechanism in the first flight of data. A server however might be providing multiple services or talking to both sides (public Internet and data-center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data-center. This patch exposes a socket-option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 20 +++++++++++++++++--- net/ipv4/tcp_input.c | 2 +- 3 files changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be07e9b6dbdd..8f36277e82e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2836,6 +2836,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -3256,6 +3264,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 21075ce19cb6..e0a4b56644aa 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -310,13 +310,23 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; @@ -333,7 +343,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -370,6 +381,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -387,7 +399,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5e64d4b5839..893286db4623 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6332,7 +6332,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, -- cgit v1.2.3 From 87b1af8dcc084d3d509d55067185ca74aed70ce2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 23 Oct 2017 14:59:35 -0700 Subject: ipv6: add ip6_null_entry check in rt6_select() In rt6_select(), fn->leaf could be pointing to net->ipv6.ip6_null_entry. In this case, we should directly return instead of trying to carry on with the rest of the process. If not, we could crash at: spin_lock_bh(&leaf->rt6i_table->rt6_lock); because net->ipv6.ip6_null_entry does not have rt6i_table set. Syzkaller recently reported following issue on net-next: Use struct sctp_sack_info instead kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: sctp: [Deprecated]: syz-executor4 (pid 26496) Use of struct sctp_assoc_value in delayed_ack socket option. Use struct sctp_sack_info instead CPU: 1 PID: 26523 Comm: syz-executor6 Not tainted 4.14.0-rc4+ #85 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d147e3c0 task.stack: ffff8801a4328000 RIP: 0010:debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] RIP: 0010:do_raw_spin_lock+0x23/0x1e0 kernel/locking/spinlock_debug.c:112 RSP: 0018:ffff8801a432ed70 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000 RDX: 0000000000000003 RSI: 0000000000000000 RDI: 000000000000001c RBP: ffff8801a432ed90 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff8482b279 R12: ffff8801ce2ff3a0 sctp: [Deprecated]: syz-executor1 (pid 26546) Use of int in maxseg socket option. Use struct sctp_assoc_value instead R13: dffffc0000000000 R14: ffff8801d971e000 R15: ffff8801ce2ff0d8 FS: 00007f56e82f5700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001a4a04000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline] _raw_spin_lock_bh+0x39/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:321 [inline] rt6_select net/ipv6/route.c:786 [inline] ip6_pol_route+0x1be3/0x3bd0 net/ipv6/route.c:1650 sctp: [Deprecated]: syz-executor1 (pid 26576) Use of int in maxseg socket option. Use struct sctp_assoc_value instead TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. Check SNMP counters. ip6_pol_route_output+0x4c/0x60 net/ipv6/route.c:1843 fib6_rule_lookup+0x9e/0x2a0 net/ipv6/ip6_fib.c:309 ip6_route_output_flags+0x1f1/0x2b0 net/ipv6/route.c:1871 ip6_route_output include/net/ip6_route.h:80 [inline] ip6_dst_lookup_tail+0x4ea/0x970 net/ipv6/ip6_output.c:953 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1076 sctp_v6_get_dst+0x675/0x1c30 net/sctp/ipv6.c:274 sctp_transport_route+0xa8/0x430 net/sctp/transport.c:287 sctp_assoc_add_peer+0x4fe/0x1100 net/sctp/associola.c:656 __sctp_connect+0x251/0xc80 net/sctp/socket.c:1187 sctp_connect+0xb4/0xf0 net/sctp/socket.c:4209 inet_dgram_connect+0x16b/0x1f0 net/ipv4/af_inet.c:541 SYSC_connect+0x20a/0x480 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 46c59a53c53f..605e5dc1c010 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -752,7 +752,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf) + if (!leaf || leaf == net->ipv6.ip6_null_entry) return net->ipv6.ip6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); -- cgit v1.2.3 From 9d452cebd7d69e9eb22b4c0482fdbb6fc762167f Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 24 Oct 2017 08:58:02 +0300 Subject: net/sched: Fix actions list corruption when adding offloaded tc flows Prior to commit b3f55bdda8df, the networking core doesn't wire an in-place actions list the when the low level driver is called to offload the flow, but all low level drivers do that (call tcf_exts_to_list()) in their offloading "add" logic. Now, the in-place list is set in the core which goes over the list in a loop, but also by the hw driver when their offloading code is invoked indirectly: cls_xxx add flow -> tc_setup_cb_call -> tc_exts_setup_cb_egdev_call -> hw driver which messes up the core list instance upon driver return. Fix that by avoiding in-place list on the net core code that deals with adding flows. Fixes: b3f55bdda8df ('net: sched: introduce per-egress action device callbacks') Signed-off-by: Or Gerlitz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdfdc24b89cf..0e96cdae9995 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1184,14 +1184,13 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, #ifdef CONFIG_NET_CLS_ACT const struct tc_action *a; struct net_device *dev; - LIST_HEAD(actions); - int ret; + int i, ret; if (!tcf_exts_has_actions(exts)) return 0; - tcf_exts_to_list(exts, &actions); - list_for_each_entry(a, &actions, list) { + for (i = 0; i < exts->nr_actions; i++) { + a = exts->actions[i]; if (!a->ops->get_dev) continue; dev = a->ops->get_dev(a); -- cgit v1.2.3 From 2420770b3fe56ca97ecf34e230762cd9f3296dae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Oct 2017 16:46:45 +0200 Subject: netfilter: nat: use test_and_clear_bit when deleting ct from bysource list We can use a single statement for this. While at it, fixup the comment -- we don't have pernet table/ops anymore, the function is only called from module exit path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index af8345fc4fbd..6c38421e31f9 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -542,17 +542,14 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (nf_nat_proto_remove(ct, data)) return 1; - if ((ct->status & IPS_SRC_NAT_DONE) == 0) - return 0; - - /* This netns is being destroyed, and conntrack has nat null binding. + /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - __nf_nat_cleanup_conntrack(ct); + if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. -- cgit v1.2.3 From c4f3db15958277c03d1c324894255ea3ecbf86e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:40 +0200 Subject: netfilter: conntrack: add and use nf_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. As a first step, add and use a new log function for this, similar to nf_ct_helper_log(). Add __cold annotation -- invalid packets should be infrequent so gcc can consider all call paths that lead to such a function as unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 18 ++++++------ net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 14 +++++---- net/netfilter/nf_conntrack_proto.c | 24 ++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 3 +- net/netfilter/nf_conntrack_proto_sctp.c | 3 +- net/netfilter/nf_conntrack_proto_tcp.c | 22 +++++++------- net/netfilter/nf_conntrack_proto_udp.c | 40 ++++++++++++-------------- 7 files changed, 72 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..7281a7b77a0e 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -165,6 +165,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +183,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +201,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a9e1fd1a8536..0f227ca4a5a2 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -176,6 +176,12 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + static int icmpv6_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -187,17 +193,13 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: short packet "); + icmpv6_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed "); + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b3e489c859ec..bcd3ee270d75 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; @@ -63,6 +64,29 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, *header = NULL; *table = NULL; } + +__printf(5, 6) +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (net->ct.sysctl_log_invalid != protonum || + net->ct.sysctl_log_invalid != IPPROTO_RAW) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_proto_%d: %pV ", protonum, &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 0f5a4d79f6b8..ef501c7edb96 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -604,8 +604,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6303a88af12b..aa630c561361 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -522,8 +522,7 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, } return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_SCTP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index cba1c6ffe51a..14198b2a2e2c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -738,6 +738,12 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; +static void tcp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); +} + /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -753,17 +759,13 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: short packet "); + tcp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: truncated/malformed packet "); + tcp_error_log(skb, net, pf, "truncated packet"); return -NF_ACCEPT; } @@ -774,18 +776,14 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: bad TCP checksum "); + tcp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid TCP flag combination "); + tcp_error_log(skb, net, pf, "invalid tcp flag combination"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8af734cd1a94..fc20cf430251 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -99,6 +99,12 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, } #ifdef CONFIG_NF_CT_PROTO_UDPLITE +static void udplite_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); +} + static int udplite_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -112,9 +118,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: short packet "); + udplite_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } @@ -122,17 +126,13 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: invalid checksum coverage "); + udplite_error_log(skb, net, pf, "invalid checksum coverage"); return -NF_ACCEPT; } /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: checksum missing "); + udplite_error_log(skb, net, pf, "checksum missing"); return -NF_ACCEPT; } @@ -140,9 +140,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: bad UDPLite checksum "); + udplite_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } @@ -150,6 +148,12 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, } #endif +static void udp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); +} + static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t pf, @@ -162,17 +166,13 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: short packet "); + udp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: truncated/malformed packet "); + udp_error_log(skb, net, pf, "truncated/malformed packet"); return -NF_ACCEPT; } @@ -186,9 +186,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: bad UDP checksum "); + udp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } -- cgit v1.2.3 From 3d0b527bc9dc0e8c4428eb1a98d4cd27bd1114c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:41 +0200 Subject: netfilter: conntrack: add and use nf_ct_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. Add a wrapper for the previous patch that extracs the information from nf_conn and passes it to nf_l4proto_log_invalid(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 23 +++++++++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 17 +++++------------ net/netfilter/nf_conntrack_proto_tcp.c | 25 +++++++++---------------- 3 files changed, 37 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index bcd3ee270d75..83f739e9dc08 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -87,6 +87,29 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); + +__printf(3, 4) +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...) +{ + struct va_format vaf; + struct net *net; + va_list args; + + net = nf_ct_net(ct); + if (likely(net->ct.sysctl_log_invalid == 0)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_ct_protonum(ct), "%pV", &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index ef501c7edb96..49e0abcdc6f4 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -428,13 +428,13 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, default: dn = dccp_pernet(net); if (dn->dccp_loose == 0) { - msg = "nf_ct_dccp: not picking up existing connection "; + msg = "not picking up existing connection "; goto out_invalid; } case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: - msg = "nf_ct_dccp: invalid state transition "; + msg = "invalid state transition "; goto out_invalid; } @@ -447,9 +447,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, - NULL, "%s", msg); + nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); return false; } @@ -472,7 +470,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, u_int8_t pf, unsigned int *timeouts) { - struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -534,15 +531,11 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid packet ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid state transition "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 14198b2a2e2c..dced574f6006 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -702,9 +702,9 @@ static bool tcp_in_window(const struct nf_conn *ct, if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal) res = true; - if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: %s ", + if (!res) { + nf_ct_l4proto_log_invalid(skb, ct, + "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? before(sack, receiver->td_end + 1) ? @@ -713,6 +713,7 @@ static bool tcp_in_window(const struct nf_conn *ct, : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); + } } pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " @@ -937,10 +938,8 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " + "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or @@ -962,9 +961,7 @@ static int tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid state "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -979,9 +976,7 @@ static int tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: challenge-ACK ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -991,9 +986,7 @@ static int tcp_packet(struct nf_conn *ct, && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, - NULL, "nf_ct_tcp: invalid RST "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); return -NF_ACCEPT; } if (index == TCP_RST_SET -- cgit v1.2.3 From eb6fad5a4a328b85d3faa8b301b522e3f316b49d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:42 +0200 Subject: netfilter: conntrack: remove pf argument from l4 packet functions not needed/used anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 1 - net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 1 - net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 1 - net/netfilter/nf_conntrack_proto_generic.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_proto_sctp.c | 1 - net/netfilter/nf_conntrack_proto_tcp.c | 6 ++---- net/netfilter/nf_conntrack_proto_udp.c | 1 - 9 files changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7281a7b77a0e..8969420cecc3 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 0f227ca4a5a2..dca921df28e1 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -94,7 +94,6 @@ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01130392b7c0..28e675150853 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1419,7 +1419,7 @@ repeat: /* Decide what timeout policy we want to apply to this flow. */ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 49e0abcdc6f4..2a446f4a554c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -467,7 +467,6 @@ static unsigned int *dccp_get_timeouts(struct net *net) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 9cd40700842e..1f86ddf6649a 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -60,7 +60,6 @@ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 09a90484c27d..a2503005d80b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -244,7 +244,6 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index aa630c561361..80faf04ddf15 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -306,7 +306,6 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index dced574f6006..8f283294d70f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -493,8 +493,7 @@ static bool tcp_in_window(const struct nf_conn *ct, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph, - u_int8_t pf) + const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); @@ -801,7 +800,6 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); @@ -1013,7 +1011,7 @@ static int tcp_packet(struct nf_conn *ct, } if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th, pf)) { + skb, dataoff, th)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fc20cf430251..3a5f727103af 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -73,7 +73,6 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP -- cgit v1.2.3 From 80055dab5de0c8677bc148c4717ddfc753a9148e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 01:13:50 +0200 Subject: netfilter: x_tables: make xt_replace_table wait until old rules are not used anymore xt_replace_table relies on table replacement counter retrieval (which uses xt_recseq to synchronize pcpu counters). This is fine, however with large rule set get_counters() can take a very long time -- it needs to synchronize all counters because it has to assume concurrent modifications can occur. Make xt_replace_table synchronize by itself by waiting until all cpus had an even seqcount. This allows a followup patch to copy the counters of the old ruleset without any synchonization after xt_replace_table has completed. Cc: Dan Williams Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c83a3b5e1c6c..a164e5123d59 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1153,6 +1153,7 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; + unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1182,14 +1183,28 @@ xt_replace_table(struct xt_table *table, smp_wmb(); table->private = newinfo; + /* make sure all cpus see new ->private value */ + smp_wmb(); + /* * Even though table entries have now been swapped, other CPU's - * may still be using the old entries. This is okay, because - * resynchronization happens because of the locking done - * during the get_counters() routine. + * may still be using the old entries... */ local_bh_enable(); + /* ... so wait for even xt_recseq on all cpus */ + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + u32 seq = raw_read_seqcount(s); + + if (seq & 1) { + do { + cond_resched(); + cpu_relax(); + } while (seq == raw_read_seqcount(s)); + } + } + #ifdef CONFIG_AUDIT if (audit_enabled) { audit_log(current->audit_context, GFP_KERNEL, -- cgit v1.2.3 From d13e7b2e65f6dfbe97b845d75741a970181b9fec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 01:13:51 +0200 Subject: netfilter: x_tables: don't use seqlock when fetching old counters after previous commit xt_replace_table will wait until all cpus had even seqcount (i.e., no cpu is accessing old ruleset). Add a 'old' counter retrival version that doesn't synchronize counters. Its not needed, the old counters are not in use anymore at this point. This speeds up table replacement on busy systems with large tables (and many cores). Cc: Dan Williams Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 22 ++++++++++++++++++++-- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++++++++++++-- net/ipv6/netfilter/ip6_tables.c | 22 ++++++++++++++++++++-- 3 files changed, 61 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e2770fd00be..f88221aebc9d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 39286e543ee6..4cbe5e80f3bf 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; /* macro does multi eval of i */ + } + + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 01bd3ee5ebc6..f06e25065a34 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -800,6 +800,25 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ip6t_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1090,8 +1109,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) -- cgit v1.2.3 From 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 09:38:30 +0200 Subject: netfilter: conntrack: make l3proto trackers const previous patches removed all writes to them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index fe01dc953c56..3b80a38f62b8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,7 +339,7 @@ static void ipv6_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv6_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, -- cgit v1.2.3 From 67704c2a05860edf8529c8271550148348737a8f Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Fri, 13 Oct 2017 04:23:57 +0530 Subject: netfilter: nf_conntrack_h323: Remove typedef struct Remove typedef from struct as linux-kernel coding style tends to avoid using typedefs. Done using following coccinelle semantic patch @r1@ type T; @@ typedef struct { ... } T; @script:python c1@ T2; T << r1.T; @@ if T[-2:] =="_t" or T[-2:] == "_T": coccinelle.T2 = T[:-2]; else: coccinelle.T2 = T; print T, coccinelle.T2 @r2@ type r1.T; identifier c1.T2; @@ -typedef struct + T2 { ... } -T ; @r3@ type r1.T; identifier c1.T2; @@ -T +struct T2 Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 80 +++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 89b2e46925c4..7831aa1effc9 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -91,41 +91,41 @@ typedef struct field_t { } field_t; /* Bit Stream */ -typedef struct { +struct bitstr { unsigned char *buf; unsigned char *beg; unsigned char *end; unsigned char *cur; unsigned int bit; -} bitstr_t; +}; /* Tool Functions */ #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} #define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) -static unsigned int get_len(bitstr_t *bs); -static unsigned int get_bit(bitstr_t *bs); -static unsigned int get_bits(bitstr_t *bs, unsigned int b); -static unsigned int get_bitmap(bitstr_t *bs, unsigned int b); -static unsigned int get_uint(bitstr_t *bs, int b); +static unsigned int get_len(struct bitstr *bs); +static unsigned int get_bit(struct bitstr *bs); +static unsigned int get_bits(struct bitstr *bs, unsigned int b); +static unsigned int get_bitmap(struct bitstr *bs, unsigned int b); +static unsigned int get_uint(struct bitstr *bs, int b); /* Decoder Functions */ -static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level); -static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_nul(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bool(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_oid(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_int(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_enum(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bitstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_numstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_octstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_seq(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_seqof(struct bitstr *bs, const struct field_t *f, char *base, int level); +static int decode_choice(struct bitstr *bs, const struct field_t *f, char *base, int level); /* Decoder Functions Vector */ -typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int); +typedef int (*decoder_t)(struct bitstr *, const struct field_t *, char *, int); static const decoder_t Decoders[] = { decode_nul, decode_bool, @@ -150,7 +150,7 @@ static const decoder_t Decoders[] = { * Functions ****************************************************************************/ /* Assume bs is aligned && v < 16384 */ -static unsigned int get_len(bitstr_t *bs) +static unsigned int get_len(struct bitstr *bs) { unsigned int v; @@ -166,7 +166,7 @@ static unsigned int get_len(bitstr_t *bs) } /****************************************************************************/ -static unsigned int get_bit(bitstr_t *bs) +static unsigned int get_bit(struct bitstr *bs) { unsigned int b = (*bs->cur) & (0x80 >> bs->bit); @@ -177,7 +177,7 @@ static unsigned int get_bit(bitstr_t *bs) /****************************************************************************/ /* Assume b <= 8 */ -static unsigned int get_bits(bitstr_t *bs, unsigned int b) +static unsigned int get_bits(struct bitstr *bs, unsigned int b) { unsigned int v, l; @@ -203,7 +203,7 @@ static unsigned int get_bits(bitstr_t *bs, unsigned int b) /****************************************************************************/ /* Assume b <= 32 */ -static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) +static unsigned int get_bitmap(struct bitstr *bs, unsigned int b) { unsigned int v, l, shift, bytes; @@ -242,7 +242,7 @@ static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) /**************************************************************************** * Assume bs is aligned and sizeof(unsigned int) == 4 ****************************************************************************/ -static unsigned int get_uint(bitstr_t *bs, int b) +static unsigned int get_uint(struct bitstr *bs, int b) { unsigned int v = 0; @@ -264,7 +264,7 @@ static unsigned int get_uint(bitstr_t *bs, int b) } /****************************************************************************/ -static int decode_nul(bitstr_t *bs, const struct field_t *f, +static int decode_nul(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -273,7 +273,7 @@ static int decode_nul(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bool(bitstr_t *bs, const struct field_t *f, +static int decode_bool(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -285,7 +285,7 @@ static int decode_bool(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_oid(bitstr_t *bs, const struct field_t *f, +static int decode_oid(struct bitstr *bs, const struct field_t *f, char *base, int level) { int len; @@ -302,7 +302,7 @@ static int decode_oid(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_int(bitstr_t *bs, const struct field_t *f, +static int decode_int(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -346,7 +346,7 @@ static int decode_int(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_enum(bitstr_t *bs, const struct field_t *f, +static int decode_enum(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); @@ -362,7 +362,7 @@ static int decode_enum(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bitstr(bitstr_t *bs, const struct field_t *f, +static int decode_bitstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -396,7 +396,7 @@ static int decode_bitstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_numstr(bitstr_t *bs, const struct field_t *f, +static int decode_numstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -414,7 +414,7 @@ static int decode_numstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_octstr(bitstr_t *bs, const struct field_t *f, +static int decode_octstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -463,7 +463,7 @@ static int decode_octstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, +static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int len; @@ -489,7 +489,7 @@ static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_seq(bitstr_t *bs, const struct field_t *f, +static int decode_seq(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len; @@ -606,7 +606,7 @@ static int decode_seq(bitstr_t *bs, const struct field_t *f, } /****************************************************************************/ -static int decode_seqof(bitstr_t *bs, const struct field_t *f, +static int decode_seqof(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int count, effective_count = 0, i, len = 0; @@ -696,7 +696,7 @@ static int decode_seqof(bitstr_t *bs, const struct field_t *f, /****************************************************************************/ -static int decode_choice(bitstr_t *bs, const struct field_t *f, +static int decode_choice(struct bitstr *bs, const struct field_t *f, char *base, int level) { unsigned int type, ext, len = 0; @@ -772,7 +772,7 @@ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT, 0, _RasMessage }; - bitstr_t bs; + struct bitstr bs; bs.buf = bs.beg = bs.cur = buf; bs.end = buf + sz; @@ -789,7 +789,7 @@ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT, 0, _H323_UserInformation }; - bitstr_t bs; + struct bitstr bs; bs.buf = buf; bs.beg = bs.cur = beg; @@ -808,7 +808,7 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4, DECODE | EXT, 0, _MultimediaSystemControlMessage }; - bitstr_t bs; + struct bitstr bs; bs.buf = bs.beg = bs.cur = buf; bs.end = buf + sz; -- cgit v1.2.3 From ce49480dba8666cba0106e8e31a942c9ce4c438a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 15 Oct 2017 11:00:34 +0200 Subject: netfilter: xt_connlimit: don't store address in the conn nodes Only stored, never read. This is a leftover from commit 7d08487777c8 ("netfilter: connlimit: use rbtree for per-host conntrack obj storage"), which added the rbtree node struct that stores the address instead. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connlimit.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index ffa8eec980e9..ce2870428631 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -46,7 +46,6 @@ struct xt_connlimit_conn { struct hlist_node node; struct nf_conntrack_tuple tuple; - union nf_inet_addr addr; }; struct xt_connlimit_rb { @@ -125,7 +124,6 @@ static bool add_hlist(struct hlist_head *head, if (conn == NULL) return false; conn->tuple = *tuple; - conn->addr = *addr; hlist_add_head(&conn->node, head); return true; } @@ -270,7 +268,6 @@ count_tree(struct net *net, struct rb_root *root, } conn->tuple = *tuple; - conn->addr = *addr; rbconn->addr = *addr; INIT_HLIST_HEAD(&rbconn->hhead); -- cgit v1.2.3 From e8daf27c2fea38e16a791780952aa5dff1c409fe Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 9 Oct 2017 07:01:14 +0200 Subject: netfilter: nf_ct_h323: Out Of Bound Read in Netfilter Conntrack Add missing counter decrement to prevent out of bounds memory read. Signed-off-by: Eric Sesterhenn Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 7831aa1effc9..cf1bf2605c10 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -877,6 +877,7 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) if (sz < 1) break; len = *p++; + sz--; if (sz < len) break; p += len; -- cgit v1.2.3 From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sat, 21 Oct 2017 00:25:15 +0300 Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote address (outer v6 destination) is one of host's locally configured addresses. Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address (outer v6 source) is a local address. This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such configurations. An example where this proves useful is a system where entities are identified by their unique v6 addresses, and use tunnels to encapsulate traffic between them. The limitation prevents placing several entities on same host. Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE which allows to bypass this restriction. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4212879ff35e..439d65f7e094 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -770,7 +770,8 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr(net, laddr, ldev, 0))) && - likely(!ipv6_chk_addr(net, raddr, NULL, 0))) + ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || + likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) ret = 1; } return ret; @@ -1000,7 +1001,8 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); - else if (!ipv6_addr_is_multicast(raddr) && + else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && + !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); -- cgit v1.2.3 From fbb85b3c0112e6aec82863358bdb1b25c7edaec1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:17 +0800 Subject: bridge: remove rtmsg_ifinfo called in add_del_if Since commit dc709f375743 ("rtnetlink: bring NETDEV_CHANGEUPPER event process back in rtnetlink_event"), rtnetlink_event would process the NETDEV_CHANGEUPPER event and send a notification to userspace. In add_del_if, it would generate NETDEV_CHANGEUPPER event by whether netdev_master_upper_dev_link or netdev_upper_dev_unlink. There's no need to call rtmsg_ifinfo to send the notification any more. So this patch is to remove it from add_del_if also to avoid redundant notifications. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 8f29103935a3..16d01a3ff33f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -102,9 +102,6 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) else ret = br_del_if(br, dev); - if (!ret) - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); - return ret; } -- cgit v1.2.3 From eeda3fb9e132bd5f9592c4c664ba944cf1ac5b9f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:19 +0800 Subject: rtnetlink: bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event This patch is to bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event so that bonding could use it instead of calling rtmsg_ifinfo to send a notification to userspace after netdev lower state is changed in the later patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index df8dba998c48..854a848842ea 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4385,6 +4385,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: + case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL); -- cgit v1.2.3 From ef5201c83d1400570a3b6f004ad7a23d71934411 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:20 +0800 Subject: bonding: remove rtmsg_ifinfo called after bond_lower_state_changed After the patch 'rtnetlink: bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event', bond_lower_state_changed would generate NETDEV_CHANGEUPPER event which would send a notification to userspace in rtnetlink_event. There's no need to call rtmsg_ifinfo to send the notification any more. So this patch is to remove it from these places after bond_lower_state_changed. Besides, after this, rtmsg_ifinfo is not needed to be exported. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 854a848842ea..de24d394c69e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2989,7 +2989,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } -EXPORT_SYMBOL(rtmsg_ifinfo); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid) -- cgit v1.2.3 From 9c3b57518363577d4e2ea1964ef4fa03e100beaa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:31 -0700 Subject: net: sctp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Vlad Yasevich Cc: Neil Horman Cc: "David S. Miller" Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/sctp/associola.c | 3 +- net/sctp/protocol.c | 7 ++-- net/sctp/sm_sideeffect.c | 85 +++++++++++++++++++++++++++++------------------- net/sctp/transport.c | 13 +++----- 4 files changed, 61 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dfb9651e818b..69394f4d6091 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -149,8 +149,7 @@ static struct sctp_association *sctp_association_init( /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) - setup_timer(&asoc->timers[i], sctp_timer_events[i], - (unsigned long)asoc); + timer_setup(&asoc->timers[i], sctp_timer_events[i], 0); /* Pull default initialization values from the sock options. * Note: This assumes that the values have already been diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index fcd80feb293f..f5172c21349b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -622,9 +622,9 @@ static void sctp_v4_ecn_capable(struct sock *sk) INET_ECN_xmit(sk); } -static void sctp_addr_wq_timeout_handler(unsigned long arg) +static void sctp_addr_wq_timeout_handler(struct timer_list *t) { - struct net *net = (struct net *)arg; + struct net *net = from_timer(net, t, sctp.addr_wq_timer); struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; @@ -1304,8 +1304,7 @@ static int __net_init sctp_defaults_init(struct net *net) INIT_LIST_HEAD(&net->sctp.auto_asconf_splist); spin_lock_init(&net->sctp.addr_wq_lock); net->sctp.addr_wq_timer.expires = 0; - setup_timer(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, - (unsigned long)net); + timer_setup(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, 0); return 0; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 402bfbb888cd..1c2699b424af 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -243,9 +243,10 @@ nomem: /* When the T3-RTX timer expires, it calls this function to create the * relevant state machine event. */ -void sctp_generate_t3_rtx_event(unsigned long peer) +void sctp_generate_t3_rtx_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *) peer; + struct sctp_transport *transport = + from_timer(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -319,50 +320,63 @@ out_unlock: sctp_association_put(asoc); } -static void sctp_generate_t1_cookie_event(unsigned long data) +static void sctp_generate_t1_cookie_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } -static void sctp_generate_t1_init_event(unsigned long data) +static void sctp_generate_t1_init_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } -static void sctp_generate_t2_shutdown_event(unsigned long data) +static void sctp_generate_t2_shutdown_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } -static void sctp_generate_t4_rto_event(unsigned long data) +static void sctp_generate_t4_rto_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } -static void sctp_generate_t5_shutdown_guard_event(unsigned long data) +static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *)data; + struct sctp_association *asoc = + from_timer(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); } /* sctp_generate_t5_shutdown_guard_event() */ -static void sctp_generate_autoclose_event(unsigned long data) +static void sctp_generate_autoclose_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } /* Generate a heart beat event. If the sock is busy, reschedule. Make * sure that the transport is still valid. */ -void sctp_generate_heartbeat_event(unsigned long data) +void sctp_generate_heartbeat_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_transport *transport = from_timer(transport, t, hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -405,9 +419,10 @@ out_unlock: /* Handle the timeout of the ICMP protocol unreachable timer. Trigger * the correct state machine transition that will close the association. */ -void sctp_generate_proto_unreach_event(unsigned long data) +void sctp_generate_proto_unreach_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *)data; + struct sctp_transport *transport = + from_timer(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -439,9 +454,10 @@ out_unlock: } /* Handle the timeout of the RE-CONFIG timer. */ -void sctp_generate_reconf_event(unsigned long data) +void sctp_generate_reconf_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *)data; + struct sctp_transport *transport = + from_timer(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -471,24 +487,27 @@ out_unlock: } /* Inject a SACK Timeout event into the state machine. */ -static void sctp_generate_sack_event(unsigned long data) +static void sctp_generate_sack_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *)data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { - NULL, - sctp_generate_t1_cookie_event, - sctp_generate_t1_init_event, - sctp_generate_t2_shutdown_event, - NULL, - sctp_generate_t4_rto_event, - sctp_generate_t5_shutdown_guard_event, - NULL, - NULL, - sctp_generate_sack_event, - sctp_generate_autoclose_event, + [SCTP_EVENT_TIMEOUT_NONE] = NULL, + [SCTP_EVENT_TIMEOUT_T1_COOKIE] = sctp_generate_t1_cookie_event, + [SCTP_EVENT_TIMEOUT_T1_INIT] = sctp_generate_t1_init_event, + [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = sctp_generate_t2_shutdown_event, + [SCTP_EVENT_TIMEOUT_T3_RTX] = NULL, + [SCTP_EVENT_TIMEOUT_T4_RTO] = sctp_generate_t4_rto_event, + [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = + sctp_generate_t5_shutdown_guard_event, + [SCTP_EVENT_TIMEOUT_HEARTBEAT] = NULL, + [SCTP_EVENT_TIMEOUT_RECONF] = NULL, + [SCTP_EVENT_TIMEOUT_SACK] = sctp_generate_sack_event, + [SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sctp_generate_autoclose_event, }; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2d9bd3776bc8..1e5a22430cf5 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -87,14 +87,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net, INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); - setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, - (unsigned long)peer); - setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, - (unsigned long)peer); - setup_timer(&peer->reconf_timer, sctp_generate_reconf_event, - (unsigned long)peer); - setup_timer(&peer->proto_unreach_timer, - sctp_generate_proto_unreach_event, (unsigned long)peer); + timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); + timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); + timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); + timer_setup(&peer->proto_unreach_timer, + sctp_generate_proto_unreach_event, 0); /* Initialize the 64-bit random nonce sent with heartbeat. */ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); -- cgit v1.2.3 From 8dbd05ff5c4e50a3e5b1ed4089c2d0b4210379c6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:39 -0700 Subject: net: ax25: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Joerg Reuter Cc: Ralf Baechle Cc: "David S. Miller" Cc: linux-hams@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 7 +++---- net/ax25/ax25_ds_timer.c | 9 ++++----- net/ax25/ax25_timer.c | 41 ++++++++++++++++++++--------------------- 3 files changed, 27 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index f3f9d18891de..06eac1f50c5e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -268,9 +268,9 @@ void ax25_destroy_socket(ax25_cb *); /* * Handler for deferred kills. */ -static void ax25_destroy_timer(unsigned long data) +static void ax25_destroy_timer(struct timer_list *t) { - ax25_cb *ax25=(ax25_cb *)data; + ax25_cb *ax25 = from_timer(ax25, t, dtimer); struct sock *sk; sk=ax25->sk; @@ -326,8 +326,7 @@ void ax25_destroy_socket(ax25_cb *ax25) if (ax25->sk != NULL) { if (sk_has_allocations(ax25->sk)) { /* Defer: outstanding buffers */ - setup_timer(&ax25->dtimer, ax25_destroy_timer, - (unsigned long)ax25); + timer_setup(&ax25->dtimer, ax25_destroy_timer, 0); ax25->dtimer.expires = jiffies + 2 * HZ; add_timer(&ax25->dtimer); } else { diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 5fb2104b7304..e9d11313d45b 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -29,7 +29,7 @@ #include #include -static void ax25_ds_timeout(unsigned long); +static void ax25_ds_timeout(struct timer_list *); /* * Add DAMA slave timeout timer to timer list. @@ -41,8 +41,7 @@ static void ax25_ds_timeout(unsigned long); void ax25_ds_setup_timer(ax25_dev *ax25_dev) { - setup_timer(&ax25_dev->dama.slave_timer, ax25_ds_timeout, - (unsigned long)ax25_dev); + timer_setup(&ax25_dev->dama.slave_timer, ax25_ds_timeout, 0); } void ax25_ds_del_timer(ax25_dev *ax25_dev) @@ -66,9 +65,9 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev) * Silently discard all (slave) connections in case our master forgot us... */ -static void ax25_ds_timeout(unsigned long arg) +static void ax25_ds_timeout(struct timer_list *t) { - ax25_dev *ax25_dev = (struct ax25_dev *) arg; + ax25_dev *ax25_dev = from_timer(ax25_dev, t, dama.slave_timer); ax25_cb *ax25; if (ax25_dev == NULL || !ax25_dev->dama.slave) diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index 23a6f38a80bf..c47b7ee1e4da 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -33,20 +33,19 @@ #include #include -static void ax25_heartbeat_expiry(unsigned long); -static void ax25_t1timer_expiry(unsigned long); -static void ax25_t2timer_expiry(unsigned long); -static void ax25_t3timer_expiry(unsigned long); -static void ax25_idletimer_expiry(unsigned long); +static void ax25_heartbeat_expiry(struct timer_list *); +static void ax25_t1timer_expiry(struct timer_list *); +static void ax25_t2timer_expiry(struct timer_list *); +static void ax25_t3timer_expiry(struct timer_list *); +static void ax25_idletimer_expiry(struct timer_list *); void ax25_setup_timers(ax25_cb *ax25) { - setup_timer(&ax25->timer, ax25_heartbeat_expiry, (unsigned long)ax25); - setup_timer(&ax25->t1timer, ax25_t1timer_expiry, (unsigned long)ax25); - setup_timer(&ax25->t2timer, ax25_t2timer_expiry, (unsigned long)ax25); - setup_timer(&ax25->t3timer, ax25_t3timer_expiry, (unsigned long)ax25); - setup_timer(&ax25->idletimer, ax25_idletimer_expiry, - (unsigned long)ax25); + timer_setup(&ax25->timer, ax25_heartbeat_expiry, 0); + timer_setup(&ax25->t1timer, ax25_t1timer_expiry, 0); + timer_setup(&ax25->t2timer, ax25_t2timer_expiry, 0); + timer_setup(&ax25->t3timer, ax25_t3timer_expiry, 0); + timer_setup(&ax25->idletimer, ax25_idletimer_expiry, 0); } void ax25_start_heartbeat(ax25_cb *ax25) @@ -120,10 +119,10 @@ unsigned long ax25_display_timer(struct timer_list *timer) EXPORT_SYMBOL(ax25_display_timer); -static void ax25_heartbeat_expiry(unsigned long param) +static void ax25_heartbeat_expiry(struct timer_list *t) { int proto = AX25_PROTO_STD_SIMPLEX; - ax25_cb *ax25 = (ax25_cb *)param; + ax25_cb *ax25 = from_timer(ax25, t, timer); if (ax25->ax25_dev) proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]; @@ -145,9 +144,9 @@ static void ax25_heartbeat_expiry(unsigned long param) } } -static void ax25_t1timer_expiry(unsigned long param) +static void ax25_t1timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = (ax25_cb *)param; + ax25_cb *ax25 = from_timer(ax25, t, t1timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -164,9 +163,9 @@ static void ax25_t1timer_expiry(unsigned long param) } } -static void ax25_t2timer_expiry(unsigned long param) +static void ax25_t2timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = (ax25_cb *)param; + ax25_cb *ax25 = from_timer(ax25, t, t2timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -183,9 +182,9 @@ static void ax25_t2timer_expiry(unsigned long param) } } -static void ax25_t3timer_expiry(unsigned long param) +static void ax25_t3timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = (ax25_cb *)param; + ax25_cb *ax25 = from_timer(ax25, t, t3timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -204,9 +203,9 @@ static void ax25_t3timer_expiry(unsigned long param) } } -static void ax25_idletimer_expiry(unsigned long param) +static void ax25_idletimer_expiry(struct timer_list *t) { - ax25_cb *ax25 = (ax25_cb *)param; + ax25_cb *ax25 = from_timer(ax25, t, idletimer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: -- cgit v1.2.3 From fc8bcaa05160528d56432e4612f522e3ceafc513 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:48 -0700 Subject: net: LLC: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Hans Liljestrand Cc: "Paul E. McKenney" Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/llc/llc_c_ac.c | 27 +++++++++++++++++---------- net/llc/llc_conn.c | 12 ++++-------- 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index ea225bd2672c..f59648018060 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -1318,9 +1318,8 @@ static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) return 0; } -static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type) +static void llc_conn_tmr_common_cb(struct sock *sk, u8 type) { - struct sock *sk = (struct sock *)timeout_data; struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); bh_lock_sock(sk); @@ -1334,24 +1333,32 @@ static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type) bh_unlock_sock(sk); } -void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data) +void llc_conn_pf_cycle_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_P_TMR); + struct llc_sock *llc = from_timer(llc, t, pf_cycle_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR); } -void llc_conn_busy_tmr_cb(unsigned long timeout_data) +void llc_conn_busy_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_BUSY_TMR); + struct llc_sock *llc = from_timer(llc, t, busy_state_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR); } -void llc_conn_ack_tmr_cb(unsigned long timeout_data) +void llc_conn_ack_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_ACK_TMR); + struct llc_sock *llc = from_timer(llc, t, ack_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR); } -void llc_conn_rej_tmr_cb(unsigned long timeout_data) +void llc_conn_rej_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_REJ_TMR); + struct llc_sock *llc = from_timer(llc, t, rej_sent_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_REJ_TMR); } int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5e91b47f0d2a..9177dbb16dce 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -902,20 +902,16 @@ static void llc_sk_init(struct sock *sk) llc->inc_cntr = llc->dec_cntr = 2; llc->dec_step = llc->connect_step = 1; - setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, 0); llc->ack_timer.expire = sysctl_llc2_ack_timeout; - setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, 0); llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; - setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, 0); llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; - setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, 0); llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; llc->n2 = 2; /* max retransmit */ -- cgit v1.2.3 From 839a6094140ade97ccc548fcd63179506c5d7fe4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:46:09 -0700 Subject: net: dccp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Adds a pointer back to the sock. Cc: Gerrit Renker Cc: "David S. Miller" Cc: Soheil Hassas Yeganeh Cc: Hannes Frederic Sowa Cc: Eric Dumazet Cc: dccp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 10 +++++----- net/dccp/ccids/ccid2.h | 1 + net/dccp/ccids/ccid3.c | 11 ++++++----- net/dccp/ccids/ccid3.h | 1 + net/dccp/timer.c | 12 +++++++----- 5 files changed, 20 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index e1295d5f2c56..1c75cd1255f6 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -126,10 +126,10 @@ static void ccid2_change_l_seq_window(struct sock *sk, u64 val) DCCPF_SEQ_WMAX)); } -static void ccid2_hc_tx_rto_expire(unsigned long data) +static void ccid2_hc_tx_rto_expire(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); + struct sock *sk = hc->sk; const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); @@ -733,8 +733,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_rpdupack = -1; hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; hc->tx_cwnd_used = 0; - setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, - (unsigned long)sk); + hc->sk = sk; + timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); INIT_LIST_HEAD(&hc->tx_av_chunks); return 0; } diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 6e50ef2898fb..1af0116dc6ce 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -85,6 +85,7 @@ struct ccid2_hc_tx_sock { tx_rto; u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; + struct sock *sk; /* Congestion Window validation (optional, RFC 2861) */ u32 tx_cwnd_used, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 119c04317d48..8b5ba6dffac7 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -195,10 +195,10 @@ static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, } } -static void ccid3_hc_tx_no_feedback_timer(unsigned long data) +static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); + struct sock *sk = hc->sk; unsigned long t_nfb = USEC_PER_SEC / 5; bh_lock_sock(sk); @@ -505,8 +505,9 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_state = TFRC_SSTATE_NO_SENT; hc->tx_hist = NULL; - setup_timer(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + hc->sk = sk; + timer_setup(&hc->tx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, 0); return 0; } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 1a9933c29672..813d91c6e1e2 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -106,6 +106,7 @@ struct ccid3_hc_tx_sock { u8 tx_last_win_count; ktime_t tx_t_last_win_count; struct timer_list tx_no_feedback_timer; + struct sock *sk; ktime_t tx_t_ld; ktime_t tx_t_nom; struct tfrc_tx_hist_entry *tx_hist; diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1e35526bf436..b50a8732ff43 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -234,10 +234,13 @@ static void dccp_write_xmitlet(unsigned long data) bh_unlock_sock(sk); } -static void dccp_write_xmit_timer(unsigned long data) +static void dccp_write_xmit_timer(struct timer_list *t) { - dccp_write_xmitlet(data); - sock_put((struct sock *)data); + struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); + struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; + + dccp_write_xmitlet((unsigned long)sk); + sock_put(sk); } void dccp_init_xmit_timers(struct sock *sk) @@ -245,8 +248,7 @@ void dccp_init_xmit_timers(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } -- cgit v1.2.3 From dda436b7accffd5313e8fd8338f724376c6fcda2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:46:16 -0700 Subject: net: hsr: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Arvid Brodin Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 9 ++++----- net/hsr/hsr_framereg.c | 6 ++---- net/hsr/hsr_framereg.h | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 172d8309f89e..b8cd43c9ed5b 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -328,12 +328,12 @@ out: /* Announce (supervision frame) timer function */ -static void hsr_announce(unsigned long data) +static void hsr_announce(struct timer_list *t) { struct hsr_priv *hsr; struct hsr_port *master; - hsr = (struct hsr_priv *) data; + hsr = from_timer(hsr, t, announce_timer); rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); @@ -463,9 +463,8 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; - setup_timer(&hsr->announce_timer, hsr_announce, (unsigned long)hsr); - - setup_timer(&hsr->prune_timer, hsr_prune_nodes, (unsigned long)hsr); + timer_setup(&hsr->announce_timer, hsr_announce, 0); + timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 284a9b820df8..286ceb41ac0c 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -365,16 +365,14 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr, /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ -void hsr_prune_nodes(unsigned long data) +void hsr_prune_nodes(struct timer_list *t) { - struct hsr_priv *hsr; + struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; - hsr = (struct hsr_priv *) data; - rcu_read_lock(); list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { /* Shorthand */ diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 4e04f0e868e9..370b45998121 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -33,7 +33,7 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr); -void hsr_prune_nodes(unsigned long data); +void hsr_prune_nodes(struct timer_list *t); int hsr_create_self_node(struct list_head *self_node_db, unsigned char addr_a[ETH_ALEN], -- cgit v1.2.3 From 17bfd8c89fb62219fa27a10b9475daea74d20a15 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:46:26 -0700 Subject: net: af_packet: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Willem de Bruijn Cc: Mike Maloney Cc: Jarno Rajahalme Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/packet/af_packet.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4f4fa323171d..9603f6ff17a4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -201,11 +201,8 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); -static void prb_retire_rx_blk_timer_expired(unsigned long); +static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); -static void prb_init_blk_timer(struct packet_sock *, - struct tpacket_kbdq_core *, - void (*func) (unsigned long)); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -540,20 +537,14 @@ static void prb_shutdown_retire_blk_timer(struct packet_sock *po, prb_del_retire_blk_timer(pkc); } -static void prb_init_blk_timer(struct packet_sock *po, - struct tpacket_kbdq_core *pkc, - void (*func) (unsigned long)) -{ - setup_timer(&pkc->retire_blk_timer, func, (long)po); - pkc->retire_blk_timer.expires = jiffies; -} - static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); + timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, + 0); + pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, @@ -671,9 +662,10 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) * prb_calc_retire_blk_tmo() calculates the tmo. * */ -static void prb_retire_rx_blk_timer_expired(unsigned long data) +static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { - struct packet_sock *po = (struct packet_sock *)data; + struct packet_sock *po = + from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; -- cgit v1.2.3 From 5c658e19a7b53b6d712c3fb08bb88649d362db9f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:46:45 -0700 Subject: net: atm/mpc: Stop using open-coded timer .data field In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using an explicit static variable to hold additional expiration details. Cc: "David S. Miller" Cc: Bhumika Goyal Cc: Andrew Morton Cc: Alexey Dobriyan Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/atm/mpc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/atm/mpc.c b/net/atm/mpc.c index b43d99430eb6..883d25778fa4 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -95,7 +95,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev); static void mpc_timer_refresh(void); -static void mpc_cache_check(unsigned long checking_time); +static void mpc_cache_check(struct timer_list *unused); static struct llc_snap_hdr llc_snap_mpoa_ctrl = { 0xaa, 0xaa, 0x03, @@ -1407,15 +1407,17 @@ static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action) msg_to_mpoad(msg, mpc); } +static unsigned long checking_time; + static void mpc_timer_refresh(void) { mpc_timer.expires = jiffies + (MPC_P2 * HZ); - mpc_timer.data = mpc_timer.expires; - mpc_timer.function = mpc_cache_check; + checking_time = mpc_timer.expires; + mpc_timer.function = (TIMER_FUNC_TYPE)mpc_cache_check; add_timer(&mpc_timer); } -static void mpc_cache_check(unsigned long checking_time) +static void mpc_cache_check(struct timer_list *unused) { struct mpoa_client *mpc = mpcs; static unsigned long previous_resolving_check_time; -- cgit v1.2.3 From f65163fed0e7dc3c79be4f96a8fe97fc89328b93 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 25 Oct 2017 16:19:52 +0200 Subject: tipc: eliminate KASAN warning The following warning was reported by syzbot on Oct 24. 2017: KASAN: slab-out-of-bounds Read in tipc_nametbl_lookup_dst_nodes This is a harmless bug, but we still want to get rid of the warning, so we swap the two conditions in question. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 2856e19e036e..b3829bcf63c7 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -697,7 +697,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, spin_lock_bh(&seq->lock); sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); stop = seq->sseqs + seq->first_free; - for (; sseq->lower <= upper && sseq != stop; sseq++) { + for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { if (tipc_in_scope(domain, publ->node)) -- cgit v1.2.3 From 8e8ef50bb4246d6fee9971994a6d846d9bc7ff4c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 23 Oct 2017 14:17:30 -0400 Subject: net: dsa: legacy: don't unmask port bitmaps The legacy code does not unmask the cpu_port_mask and dsa_port_mask as stated. But this is done on the error path and those masks won't be used after that. So instead of fixing the bit operation, simply remove it. Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation") Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/legacy.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index b6c88fd33d4f..93c1c43bcc58 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -272,10 +272,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; dsa_cpu_dsa_destroy(&ds->ports[port]); - - /* Clearing a bit which is not set does no harm */ - ds->cpu_port_mask |= ~(1 << port); - ds->dsa_port_mask |= ~(1 << port); } if (ds->slave_mii_bus && ds->ops->phy_read) -- cgit v1.2.3 From eaac97466ee4e0ab6ed67afbf9fc400862e64986 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 23 Oct 2017 14:17:31 -0400 Subject: net: dsa: don't unmask port bitmaps The unapply functions are called on the error path. As for dsa_port_mask, enabled_port_mask and cpu_port_mask won't be used after so there's no need to unmask the corresponding port bit from them. This makes dsa_cpu_port_unapply() and dsa_dsa_port_unapply() identical, which can be factorized later. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9e8b8aab049d..62485a57dbfc 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -260,8 +260,6 @@ static void dsa_cpu_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); dsa_cpu_dsa_destroy(port); - port->ds->cpu_port_mask &= ~BIT(port->index); - } static int dsa_user_port_apply(struct dsa_port *port) @@ -300,7 +298,6 @@ static void dsa_user_port_unapply(struct dsa_port *port) if (port->slave) { dsa_slave_destroy(port->slave); port->slave = NULL; - port->ds->enabled_port_mask &= ~(1 << port->index); } } @@ -512,7 +509,6 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); - ds->cpu_port_mask &= ~BIT(index); return PTR_ERR(tag_ops); } -- cgit v1.2.3 From c1eef220c1760762753b602c382127bfccee226d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 24 Oct 2017 15:30:37 -0700 Subject: vsock: always call vsock_init_tables() Although CONFIG_VSOCKETS_DIAG depends on CONFIG_VSOCKETS, vsock_init_tables() is not always called, it is called only if other modules call its caller. Therefore if we only enable CONFIG_VSOCKETS_DIAG, it would crash kernel on uninitialized vsock_bind_table. This patch fixes it by moving vsock_init_tables() to its own module_init(). Fixes: 413a4317aca7 ("VSOCK: add sock_diag interface") Reported-by: syzkaller bot Cc: Stefan Hajnoczi Cc: Jorgen Hansen Signed-off-by: Cong Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 98359c19522f..5d28abf87fbf 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -195,7 +195,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk) return __vsock_bind(sk, &local_addr); } -static void vsock_init_tables(void) +static int __init vsock_init_tables(void) { int i; @@ -204,6 +204,7 @@ static void vsock_init_tables(void) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); + return 0; } static void __vsock_insert_bound(struct list_head *list, @@ -1957,8 +1958,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) vsock_proto.owner = owner; transport = t; - vsock_init_tables(); - vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { @@ -2019,6 +2018,8 @@ const struct vsock_transport *vsock_core_get_transport(void) } EXPORT_SYMBOL_GPL(vsock_core_get_transport); +module_init(vsock_init_tables); + MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.2.0-k"); -- cgit v1.2.3 From e233df01576bba9f5bafacccd571353b72152bd5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 24 Oct 2017 15:44:49 -0700 Subject: tipc: fix a dangling pointer tsk->group is set to grp earlier, but we forget to unset it after grp is freed. Fixes: 75da2163dbb6 ("tipc: introduce communication groups") Reported-by: syzkaller bot Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b3b72d8e9543..ea61c32f6b80 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2756,8 +2756,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) seq.upper = seq.lower; tipc_nametbl_build_group(net, grp, mreq->type, domain); rc = tipc_sk_publish(tsk, mreq->scope, &seq); - if (rc) + if (rc) { tipc_group_delete(net, grp); + tsk->group = NULL; + } /* Eliminate any risk that a broadcast overtakes the sent JOIN */ tsk->mc_method.rcast = true; -- cgit v1.2.3 From 145686baab68e9c7594fe9269f47da479c25ad79 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:44 +0200 Subject: smc: fix mutex unlocks during link group creation Link group creation is synchronized with the smc_create_lgr_pending lock. In smc_listen_work() this mutex is sometimes unlocked, even though it has not been locked before. This issue will surface in presence of the SMC rendezvous code. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 745f145d4c4d..70f7640f5090 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -808,7 +808,7 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - goto decline_rdma; + goto decline_rdma_unlock; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -816,7 +816,7 @@ static void smc_listen_work(struct work_struct *work) rc = smc_buf_create(new_smc); if (rc) { reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma; + goto decline_rdma_unlock; } smc_close_init(new_smc); @@ -831,7 +831,7 @@ static void smc_listen_work(struct work_struct *work) buf_desc->mr_rx[SMC_SINGLE_LINK]); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } } } @@ -839,15 +839,15 @@ static void smc_listen_work(struct work_struct *work) rc = smc_clc_send_accept(new_smc, local_contact); if (rc) - goto out_err; + goto out_err_unlock; /* receive SMC Confirm CLC message */ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), SMC_CLC_CONFIRM); if (reason_code < 0) - goto out_err; + goto out_err_unlock; if (reason_code > 0) - goto decline_rdma; + goto decline_rdma_unlock; smc_conn_save_peer_info(new_smc, &cclc); if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, &cclc); @@ -855,34 +855,34 @@ static void smc_listen_work(struct work_struct *work) rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } if (local_contact == SMC_FIRST_CONTACT) { rc = smc_ib_ready_link(link); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); if (reason_code < 0) { /* peer is not aware of a problem */ rc = reason_code; - goto out_err; + goto out_err_unlock; } if (reason_code > 0) - goto decline_rdma; + goto decline_rdma_unlock; } smc_tx_init(new_smc); + mutex_unlock(&smc_create_lgr_pending); out_connected: sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; enqueue: - mutex_unlock(&smc_create_lgr_pending); lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -896,6 +896,8 @@ enqueue: sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ return; +decline_rdma_unlock: + mutex_unlock(&smc_create_lgr_pending); decline_rdma: /* RDMA setup failed, switch back to TCP */ smc_conn_free(&new_smc->conn); @@ -907,6 +909,8 @@ decline_rdma: } goto out_connected; +out_err_unlock: + mutex_unlock(&smc_create_lgr_pending); out_err: newsmcsk->sk_state = SMC_CLOSED; smc_conn_free(&new_smc->conn); -- cgit v1.2.3 From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:45 +0200 Subject: tcp: TCP experimental option for SMC The SMC protocol [1] relies on the use of a new TCP experimental option [2, 3]. With this option, SMC capabilities are exchanged between peers during the TCP three way handshake. This patch adds support for this experimental option to TCP. References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 [2] Shared Use of TCP Experimental Options RFC 6994: https://tools.ietf.org/rfc/rfc6994.txt [3] IANA ExID SMCR: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 +++++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 19 +++++++++++++++ net/ipv4/tcp_output.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 120 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8f36277e82e9..f6e1c00e300e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 893286db4623..337f6011528a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,6 +76,8 @@ #include #include #include +#include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -3737,6 +3739,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3844,6 +3861,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -5598,6 +5618,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5704,6 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -6157,6 +6189,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a952357054f4..056009f1c14f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -416,6 +417,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -433,6 +449,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f01f4c9c738..c8fc512e0bbb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -540,6 +557,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. This is not the final @@ -607,11 +659,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -667,6 +722,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -3195,8 +3252,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From c5c1cc9c522fc337601213afeb39c3df2eb92d04 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:46 +0200 Subject: smc: add SMC rendezvous protocol The SMC protocol [1] uses a rendezvous protocol to negotiate SMC capability between peers. The current Linux implementation does not yet use this rendezvous protocol and, thus, is not compliant to RFC7609 and incompatible with other SMC implementations like in zOS. This patch adds support for the SMC rendezvous protocol. It uses a new TCP experimental option. With this option, SMC capabilities are exchanged between the peers during the TCP three way handshake. [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 70f7640f5090..6451c5013e06 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -390,6 +390,12 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { + /* peer has not signalled SMC-capability */ + smc->use_fallback = true; + goto out_connected; + } + /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(smc)) { reason_code = SMC_CLC_DECL_IPSEC; @@ -555,6 +561,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, } smc_copy_sock_settings_to_clc(smc); + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc) goto out; @@ -759,6 +766,12 @@ static void smc_listen_work(struct work_struct *work) u8 prefix_len; u8 ibport; + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + new_smc->use_fallback = true; + goto out_connected; + } + /* do inband token exchange - *wait for and receive SMC Proposal CLC message */ @@ -967,6 +980,7 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1409,6 +1423,7 @@ static int __init smc_init(void) goto out_sock; } + static_branch_enable(&tcp_have_smc); return 0; out_sock: @@ -1433,6 +1448,7 @@ static void __exit smc_exit(void) list_del_init(&lgr->list); smc_lgr_free(lgr); /* free link group */ } + static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); proto_unregister(&smc_proto); -- cgit v1.2.3 From 2fc5f83b92ba93f8f7119461d998bcdb1ac519ac Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 26 Oct 2017 06:31:35 -0500 Subject: net: xfrm_user: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f7a12aa15086..bbc390ec6d29 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1146,13 +1146,14 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, u32 *flags = nlmsg_data(nlh); u32 sportid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; + int err; r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC); if (r_skb == NULL) return -ENOMEM; - if (build_spdinfo(r_skb, net, sportid, seq, *flags) < 0) - BUG(); + err = build_spdinfo(r_skb, net, sportid, seq, *flags); + BUG_ON(err < 0); return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } @@ -1204,13 +1205,14 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, u32 *flags = nlmsg_data(nlh); u32 sportid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; + int err; r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC); if (r_skb == NULL) return -ENOMEM; - if (build_sadinfo(r_skb, net, sportid, seq, *flags) < 0) - BUG(); + err = build_sadinfo(r_skb, net, sportid, seq, *flags); + BUG_ON(err < 0); return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } @@ -1957,8 +1959,9 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; - if (build_aevent(r_skb, x, &c) < 0) - BUG(); + err = build_aevent(r_skb, x, &c); + BUG_ON(err < 0); + err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); xfrm_state_put(x); @@ -2385,6 +2388,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, { struct net *net = &init_net; struct sk_buff *skb; + int err; skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k, !!encap), GFP_ATOMIC); @@ -2392,8 +2396,8 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, return -ENOMEM; /* build migrate */ - if (build_migrate(skb, m, num_migrate, k, sel, encap, dir, type) < 0) - BUG(); + err = build_migrate(skb, m, num_migrate, k, sel, encap, dir, type); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MIGRATE); } @@ -2617,13 +2621,14 @@ static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event { struct net *net = xs_net(x); struct sk_buff *skb; + int err; skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - if (build_aevent(skb, x, c) < 0) - BUG(); + err = build_aevent(skb, x, c); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_AEVENTS); } @@ -2830,13 +2835,14 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, { struct net *net = xs_net(x); struct sk_buff *skb; + int err; skb = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - if (build_acquire(skb, x, xt, xp) < 0) - BUG(); + err = build_acquire(skb, x, xt, xp); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_ACQUIRE); } @@ -2945,13 +2951,14 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct { struct net *net = xp_net(xp); struct sk_buff *skb; + int err; skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - if (build_polexpire(skb, xp, dir, c) < 0) - BUG(); + err = build_polexpire(skb, xp, dir, c); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE); } @@ -3106,13 +3113,14 @@ static int xfrm_send_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { struct sk_buff *skb; + int err; skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - if (build_report(skb, proto, sel, addr) < 0) - BUG(); + err = build_report(skb, proto, sel, addr); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT); } @@ -3153,6 +3161,7 @@ static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, { struct net *net = xs_net(x); struct sk_buff *skb; + int err; if (x->id.proto != IPPROTO_ESP) return -EINVAL; @@ -3164,8 +3173,8 @@ static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, if (skb == NULL) return -ENOMEM; - if (build_mapping(skb, x, ipaddr, sport) < 0) - BUG(); + err = build_mapping(skb, x, ipaddr, sport); + BUG_ON(err < 0); return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MAPPING); } -- cgit v1.2.3 From 4dc12ffeaeac939097a3f55c881d3dc3523dff0c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 25 Oct 2017 15:57:55 +0200 Subject: l2tp: cleanup l2tp_tunnel_delete calls l2tp_tunnel_delete does not return anything since commit 62b982eeb458 ("l2tp: fix race condition in l2tp_tunnel_delete"). But call sites of l2tp_tunnel_delete still do casts to void to avoid unused return value warnings. Kill these now useless casts. Signed-off-by: Jiri Slaby Cc: Sabrina Dubroca Cc: Guillaume Nault Cc: David S. Miller Cc: netdev@vger.kernel.org Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_netlink.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 02d61101b108..af22aa8ae35b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1891,7 +1891,7 @@ static __net_exit void l2tp_exit_net(struct net *net) rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - (void)l2tp_tunnel_delete(tunnel); + l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f5179424eaf1..f04fb347d251 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -282,7 +282,7 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_DELETE); - (void) l2tp_tunnel_delete(tunnel); + l2tp_tunnel_delete(tunnel); l2tp_tunnel_dec_refcount(tunnel); -- cgit v1.2.3 From eee12df5a0bd5769af5efb72fa95dd1f633a266c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 26 Oct 2017 07:51:06 -0500 Subject: ipv6: esp6: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG in esp_remove_trailer. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 1696401fed6c..4000b71bfdc5 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,8 +483,8 @@ static inline int esp_remove_trailer(struct sk_buff *skb) goto out; } - if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) - BUG(); + ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2); + BUG_ON(ret); ret = -EINVAL; padlen = nexthdr[0]; -- cgit v1.2.3 From 2ae21cf527da0e5cf9d7ee14bd5b0909bb9d1a75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:56 -0700 Subject: tcp: Namespace-ify sysctl_tcp_early_retrans Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 4 +++- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81d218346cf7..f0f650f020af 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -634,15 +634,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_early_retrans", - .data = &sysctl_tcp_early_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &four, - }, { .procname = "tcp_min_tso_segs", .data = &sysctl_tcp_min_tso_segs, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &init_net.ipv4.sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &four, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 337f6011528a..7656b1e6d504 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,7 +95,6 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23a8100af5ad..7ab313f6768e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2484,6 +2484,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c8fc512e0bbb..21713836d46a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2435,6 +2435,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, rto_delta_us; + int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. @@ -2442,10 +2443,11 @@ bool tcp_schedule_loss_probe(struct sock *sk) if (tp->fastopen_rsk) return false; + early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || icsk->icsk_ca_state != TCP_CA_Open) return false; -- cgit v1.2.3 From e20223f1962831d1b1c416d59d259879d0639d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:57 -0700 Subject: tcp: Namespace-ify sysctl_tcp_recovery Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_recovery.c | 2 -- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f0f650f020af..78019adcae87 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -449,13 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_recovery", - .data = &sysctl_tcp_recovery, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &four, }, + { + .procname = "tcp_recovery", + .data = &init_net.ipv4.sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7656b1e6d504..5b2272dbf6a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2788,7 +2788,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ab313f6768e..517ff1948a71 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2485,6 +2485,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index cda6074a429a..d3603a9e24ea 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,8 +1,6 @@ #include #include -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 2c04ac8ae0b61e0780a30b7069a11bb202b21f26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:58 -0700 Subject: tcp: Namespace-ify sysctl_tcp_thin_linear_timeouts Note that sysctl_tcp_thin_dupack was not used, I deleted it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_timer.c | 4 +--- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 78019adcae87..12003214f4d8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -620,13 +620,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_thin_linear_timeouts", - .data = &sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_min_tso_segs", .data = &sysctl_tcp_min_tso_segs, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 804a8d34ce86..035a1ef1f2d8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,8 +22,6 @@ #include #include -int sysctl_tcp_thin_linear_timeouts __read_mostly; - /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -522,7 +520,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; -- cgit v1.2.3 From b510f0d23a47c3d1f074fe583e7867dc4918fe02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:59 -0700 Subject: tcp: Namespace-ify sysctl_tcp_slow_start_after_idle Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 5 +---- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12003214f4d8..40d69af8b363 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -571,13 +571,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_slow_start_after_idle", - .data = &sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_slow_start_after_idle", + .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 517ff1948a71..cea63a4b5965 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2486,6 +2486,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 21713836d46a..bdc288a06f94 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; */ int sysctl_tcp_tso_win_divisor __read_mostly = 3; -/* By default, RFC2861 behavior. */ -int sysctl_tcp_slow_start_after_idle __read_mostly = 1; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1690,7 +1687,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sysctl_tcp_slow_start_after_idle && + if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); -- cgit v1.2.3 From e0a1e5b519236dc1662ff25e42560dd1be9e3776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:00 -0700 Subject: tcp: Namespace-ify sysctl_tcp_retrans_collapse Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 5 +---- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 40d69af8b363..533b92ad39dd 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -386,13 +386,6 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, } static struct ctl_table ipv4_table[] = { - { - .procname = "tcp_retrans_collapse", - .data = &sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_retrans_collapse", + .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cea63a4b5965..2bc6ba2059d3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2487,7 +2487,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ - + net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdc288a06f94..55a0aa4b96df 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,9 +45,6 @@ #include -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse __read_mostly = 1; - /* People can turn this on to work with those rare, broken TCPs that * interpret the window field as a signed quantity. */ @@ -2804,7 +2801,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sysctl_tcp_retrans_collapse) + if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; -- cgit v1.2.3 From 3f4c7c6f6a9053493ce7dd8a0f17ed8eafc53893 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:01 -0700 Subject: tcp: Namespace-ify sysctl_tcp_stdurg Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 533b92ad39dd..a34bb75815c1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_stdurg", - .data = &sysctl_tcp_stdurg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_rfc1337", .data = &sysctl_tcp_rfc1337, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_stdurg", + .data = &init_net.ipv4.sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5b2272dbf6a9..14b06963c102 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; -int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; @@ -5123,7 +5122,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sysctl_tcp_stdurg) + if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); -- cgit v1.2.3 From 625357aa175c688d219da43c8cfaa2e1629e0e1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:02 -0700 Subject: tcp: Namespace-ify sysctl_tcp_rfc1337 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_minisocks.c | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a34bb75815c1..832e554235df 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_rfc1337", - .data = &sysctl_tcp_rfc1337, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_rfc1337", + .data = &init_net.ipv4.sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 14b06963c102..64fde81b0eb7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; -int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 056009f1c14f..11836667763c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -181,7 +181,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (sysctl_tcp_rfc1337 == 0) { + if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; -- cgit v1.2.3 From 65c9410cf55ecf32da1b720f563365d565d6289a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:03 -0700 Subject: tcp: Namespace-ify sysctl_tcp_abort_on_overflow Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 4 +--- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 832e554235df..ffd1fd769bba 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -393,13 +393,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_abort_on_overflow", - .data = &sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_abort_on_overflow", + .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 11836667763c..3674d63170b2 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -29,8 +29,6 @@ #include #include -int sysctl_tcp_abort_on_overflow __read_mostly; - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -783,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } -- cgit v1.2.3 From 0bc65a28ae2aeb14aab7f4a930e0d8cf4cad9dc4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:04 -0700 Subject: tcp: Namespace-ify sysctl_tcp_fack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_minisocks.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ffd1fd769bba..1f23be13ce7b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_fack", - .data = &sysctl_tcp_fack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fack", + .data = &init_net.ipv4.sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f6e1c00e300e..c7c983f0f817 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2517,7 +2517,7 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 64fde81b0eb7..c5b94460793f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -5720,7 +5719,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sysctl_tcp_fack) + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3674d63170b2..3270ab8416ce 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -510,7 +510,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(newtp); } newtp->window_clamp = req->rsk_window_clamp; -- cgit v1.2.3 From c6e218035913e14952b04ceecf1a543205106fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:06 -0700 Subject: tcp: Namespace-ify sysctl_tcp_max_reordering Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 2 ++ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1f23be13ce7b..18cd228a2069 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_max_reordering", - .data = &sysctl_tcp_max_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_reordering", + .data = &init_net.ipv4.sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b94460793f..c118657f06ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -889,7 +888,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, return; if (metric > tp->reordering) { - tp->reordering = min(sysctl_tcp_max_reordering, metric); + tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2bc6ba2059d3..c379a242abb3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2488,6 +2488,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.2.3 From 6496f6bde0c323fba5e8c5b5cbf3a7bf28dad7ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:07 -0700 Subject: tcp: Namespace-ify sysctl_tcp_dsack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 18cd228a2069..7652a9c2a65d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_dsack", + .data = &init_net.ipv4.sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c118657f06ee..fd77037ac800 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); @@ -4150,7 +4149,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4185,7 +4184,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c379a242abb3..d9d4d191e8f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2489,6 +2489,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 0c12654ac6d9004b9538b2a969b2b59e9a5ed831 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:08 -0700 Subject: tcp: Namespace-ify sysctl_tcp_app_win Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7652a9c2a65d..e057788834a9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_app_win", - .data = &sysctl_tcp_app_win, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_adv_win_scale", .data = &sysctl_tcp_adv_win_scale, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_app_win", + .data = &init_net.ipv4.sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fd77037ac800..6af4b58ac6d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); @@ -428,6 +427,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) */ void tcp_init_buffer_space(struct sock *sk) { + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -446,14 +446,14 @@ void tcp_init_buffer_space(struct sock *sk) if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; - if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + if (tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - - (maxwin >> sysctl_tcp_app_win), + (maxwin >> tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. */ - if (sysctl_tcp_app_win && + if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9d4d191e8f3..189664ebd28e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2490,6 +2490,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 94f0893e0c27219f4a726932618505aab6795973 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:09 -0700 Subject: tcp: Namespace-ify sysctl_tcp_adv_win_scale Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 13 +++++-------- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e057788834a9..a95123e1e7da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,15 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_adv_win_scale", - .data = &sysctl_tcp_adv_win_scale, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_adv_win_scale_min, - .extra2 = &tcp_adv_win_scale_max, - }, { .procname = "tcp_frto", .data = &sysctl_tcp_frto, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_adv_win_scale", + .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6af4b58ac6d5..8ee2c84b0bc6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,9 +79,6 @@ #include #include -int sysctl_tcp_adv_win_scale __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); - /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -363,8 +360,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize) >> 1; - int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -389,7 +386,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); @@ -630,7 +627,7 @@ void tcp_rcv_space_adjust(struct sock *sk) } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) + while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); @@ -4809,7 +4806,7 @@ restart: * overlaps to the next one. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && - (tcp_win_from_space(skb->truesize) > skb->len || + (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 189664ebd28e..1fe30fb99308 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2491,6 +2491,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From af9b69a7a6ca6b817e8d6f416e7aa5b2a5bf1d91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:10 -0700 Subject: tcp: Namespace-ify sysctl_tcp_frto Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a95123e1e7da..f1bcb9b7e082 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_frto", - .data = &sysctl_tcp_frto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_min_rtt_wlen", .data = &sysctl_tcp_min_rtt_wlen, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &tcp_adv_win_scale_min, .extra2 = &tcp_adv_win_scale_max, }, + { + .procname = "tcp_frto", + .data = &init_net.ipv4.sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ee2c84b0bc6..90d76f1c8f96 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -83,7 +83,6 @@ int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -2026,7 +2025,7 @@ void tcp_enter_loss(struct sock *sk) * falsely raise the receive window, which results in repeated * timeouts and stop-and-go behavior. */ - tp->frto = sysctl_tcp_frto && + tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1fe30fb99308..49757c758211 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2492,6 +2492,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 399ba77a94e1ee01f747b168c429a121164ac962 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Oct 2017 17:32:05 -0700 Subject: net: dsa: Simplify dsa_slave_phy_setup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the code that tried to identify if a PHY designated by Device Tree required diversion through the DSA-created MDIO bus. This was created mainly for the bcm_sf2.c driver back when it did not have its own MDIO bus driver, which it now has since 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus"). Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Tested-by: Martin Hundebøll Signed-off-by: David S. Miller --- net/dsa/slave.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d0ae7010ea45..808e205227c3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1060,28 +1060,10 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) phy_flags = ds->ops->get_phy_flags(ds, dp->index); if (phy_dn) { - int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); - - /* If this PHY address is part of phys_mii_mask, which means - * that we need to divert reads and writes to/from it, then we - * want to bind this device using the slave MII bus created by - * DSA to make that happen. - */ - if (!phy_is_fixed && phy_id >= 0 && - (ds->phys_mii_mask & (1 << phy_id))) { - ret = dsa_slave_phy_connect(slave_dev, phy_id); - if (ret) { - netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); - of_node_put(phy_dn); - return ret; - } - } else { - slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); - } - + slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); of_node_put(phy_dn); } -- cgit v1.2.3 From 535f010d4ba798214bbd88ce18a95cd99cd4a8cb Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 26 Oct 2017 11:00:49 +0200 Subject: net: dsa: lan9303: Learn addresses on CPU port when bridged When CPU transmit directly to port using tag, the LAN9303 does not learn MAC addresses received on the CPU port into the ALR. ALR learning is performed only when transmitting using ALR lookup. Solution: If the two external ports are bridged and the packet is not STP BPDU, then use ALR lookup to allow ALR learning on CPU port. Otherwise transmit directly to port with STP state override. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 57519597c6fc..64092325aac3 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -11,6 +11,7 @@ * GNU General Public License for more details. * */ +#include #include #include #include @@ -39,6 +40,23 @@ */ #define LAN9303_TAG_LEN 4 +# define LAN9303_TAG_TX_USE_ALR BIT(3) +# define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) +#define eth_stp_addr eth_reserved_addr_base + +/* Decide whether to transmit using ALR lookup, or transmit directly to + * port using tag. ALR learning is performed only when using ALR lookup. + * If the two external ports are bridged and the packet is not STP BPDU, + * then use ALR lookup to allow ALR learning on CPU port. + * Otherwise transmit directly to port with STP state override. + * See also: lan9303_separate_ports() and lan9303.pdf 6.4.10.1 + */ +static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr) +{ + struct lan9303 *chip = dp->ds->priv; + + return chip->is_bridged && !ether_addr_equal(dest_addr, eth_stp_addr); +} static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -62,7 +80,10 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); lan9303_tag[0] = htons(ETH_P_8021Q); - lan9303_tag[1] = htons(dp->index | BIT(4)); + lan9303_tag[1] = lan9303_xmit_use_arl(dp, skb->data) ? + LAN9303_TAG_TX_USE_ALR : + dp->index | LAN9303_TAG_TX_STP_OVERRIDE; + lan9303_tag[1] = htons(lan9303_tag[1]); return skb; } -- cgit v1.2.3 From 4a5b85ffe2a001b52d165931ad05d2d620daca3c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:55 -0400 Subject: net: dsa: use dsa_is_user_port everywhere Most of the DSA code still check ds->enabled_port_mask directly to inspect a given port type instead of using the provided dsa_is_user_port helper. Change this. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 +- net/dsa/legacy.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a3abf7a7b9a2..fe0081730305 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -201,7 +201,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return ds->enabled_port_mask & (1 << p) && ds->ports[p].slave; + return dsa_is_user_port(ds, p) && ds->ports[p].slave; } int dsa_switch_suspend(struct dsa_switch *ds) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 93c1c43bcc58..0b79c6171d0d 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -190,7 +190,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, ds->ports[i].dn = cd->port_dn[i]; ds->ports[i].cpu_dp = dst->cpu_dp; - if (!(ds->enabled_port_mask & (1 << i))) + if (dsa_is_user_port(ds, i)) continue; ret = dsa_slave_create(&ds->ports[i], cd->port_names[i]); @@ -258,7 +258,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) /* Destroy network devices for physical switch ports. */ for (port = 0; port < ds->num_ports; port++) { - if (!(ds->enabled_port_mask & (1 << port))) + if (!dsa_is_user_port(ds, port)) continue; if (!ds->ports[port].slave) -- cgit v1.2.3 From 02bc6e546e858b209c3ebe380a13a73b333b1b3f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:56 -0400 Subject: net: dsa: introduce dsa_user_ports helper Introduce a dsa_user_ports() helper to return the ds->enabled_port_mask mask which is more explicit. This will also minimize diffs when touching this internal mask. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- net/dsa/legacy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 62485a57dbfc..d43c59c91058 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -312,7 +312,7 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) * the slave MDIO bus driver rely on these values for probing PHY * devices or not */ - ds->phys_mii_mask = ds->enabled_port_mask; + ds->phys_mii_mask |= dsa_user_ports(ds); /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 0b79c6171d0d..fa543c4a6061 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -136,7 +136,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, /* Make the built-in MII bus mask match the number of ports, * switch drivers can override this later */ - ds->phys_mii_mask = ds->enabled_port_mask; + ds->phys_mii_mask |= dsa_user_ports(ds); /* * If the CPU connects to this switch, set the switch tree -- cgit v1.2.3 From 057cad2c59d73b0c4a6638546f3099d6fb444094 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:57 -0400 Subject: net: dsa: define port types Introduce an enumerated type for ports, which will be way more explicit to identify a port type instead of digging into switch port masks. A port can be of type CPU, DSA, user, or unused by default. This is a static parsed information that cannot be changed at runtime. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 3 +++ net/dsa/legacy.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d43c59c91058..dd6f35b92937 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -185,6 +185,7 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) return err; ds->dsa_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_DSA; } return 0; @@ -504,6 +505,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, * net/dsa/dsa.c::dsa_switch_setup_one does. */ ds->cpu_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_CPU; tag_protocol = ds->ops->get_tag_protocol(ds); tag_ops = dsa_resolve_tag_protocol(tag_protocol); @@ -543,6 +545,7 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) * net/dsa/dsa.c::dsa_switch_setup_one does. */ ds->enabled_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_USER; } } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index fa543c4a6061..9fd5b3adab1e 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -101,6 +101,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; int index = ds->index; + struct dsa_port *dp; int i, ret; /* @@ -109,6 +110,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, for (i = 0; i < ds->num_ports; i++) { char *name; + dp = &ds->ports[i]; + name = cd->port_names[i]; if (name == NULL) continue; @@ -122,10 +125,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, dst->cpu_dp = &ds->ports[i]; dst->cpu_dp->master = master; ds->cpu_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_CPU; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_DSA; } else { ds->enabled_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_USER; } valid_name_found = true; } -- cgit v1.2.3 From 5749f0f3772b9d98f37e3a92539f49fafaa64eca Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:59 -0400 Subject: net: dsa: remove port masks Now that DSA core provides port types, there is no need to keep this information at the switch level. This is a static information that is part of a DSA core dsa_port structure. Remove them. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 11 ----------- net/dsa/legacy.c | 3 --- 2 files changed, 14 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dd6f35b92937..ec58654a71cd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -184,7 +184,6 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err != 0) return err; - ds->dsa_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_DSA; } @@ -500,11 +499,6 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->master = ethernet_dev; } - /* Initialize cpu_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - ds->cpu_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_CPU; tag_protocol = ds->ops->get_tag_protocol(ds); @@ -540,11 +534,6 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; } else { - /* Initialize enabled_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - ds->enabled_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_USER; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 9fd5b3adab1e..93e1b116ef83 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -124,13 +124,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, } dst->cpu_dp = &ds->ports[i]; dst->cpu_dp->master = master; - ds->cpu_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_CPU; } else if (!strcmp(name, "dsa")) { - ds->dsa_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_DSA; } else { - ds->enabled_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_USER; } valid_name_found = true; -- cgit v1.2.3 From 26aa0459fad28725aa0bc12a3615cc9a0bd7118f Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Mon, 16 Oct 2017 18:01:23 -0700 Subject: net/sched: Check for null dev_queue on create flow In qdisc_alloc() the dev_queue pointer was used without any checks being performed. If qdisc_create() gets a null dev_queue pointer, it just passes it along to qdisc_alloc(), leading to a crash. That happens if a root qdisc implements select_queue() and returns a null dev_queue pointer for an "invalid handle", for example, or if the dev_queue associated with the parent qdisc is null. This patch is in preparation for the next in this series, where select_queue() is being added to mqprio and as it may return a null dev_queue. Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- net/sched/sch_generic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6ced7c89c141..aa74aa42b5d7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -603,8 +603,14 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc *sch; unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; - struct net_device *dev = dev_queue->dev; + struct net_device *dev; + + if (!dev_queue) { + err = -EINVAL; + goto errout; + } + dev = dev_queue->dev; p = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); -- cgit v1.2.3 From ce8a75f60b75831947569229616cf1bc94f2d965 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Mon, 16 Oct 2017 18:01:24 -0700 Subject: net/sched: Change behavior of mq select_queue() Currently, the class_ops select_queue() implementation on sch_mq returns a pointer to netdev_queue #0 when it receives and invalid qdisc id. That can be misleading since all of mq's inner qdiscs are attached to a valid netdev_queue. Here we fix that by returning NULL when a qdisc id is invalid. This is aligned with how select_queue() is implemented for sch_mqprio in the next patch on this series, keeping a consistent behavior between these two qdiscs. Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- net/sched/sch_mq.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f3a3e507422b..213b586a06a0 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -130,15 +130,7 @@ static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) static struct netdev_queue *mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { - unsigned int ntx = TC_H_MIN(tcm->tcm_parent); - struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); - - if (!dev_queue) { - struct net_device *dev = qdisc_dev(sch); - - return netdev_get_tx_queue(dev, 0); - } - return dev_queue; + return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, -- cgit v1.2.3 From 0f7787b4133fb26b6dc0779e4867408e07711d8e Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Mon, 16 Oct 2017 18:01:25 -0700 Subject: net/sched: Add select_queue() class_ops for mqprio When replacing a child qdisc from mqprio, tc_modify_qdisc() must fetch the netdev_queue pointer that the current child qdisc is associated with before creating the new qdisc. Currently, when using mqprio as root qdisc, the kernel will end up getting the queue #0 pointer from the mqprio (root qdisc), which leaves any new child qdisc with a possibly wrong netdev_queue pointer. Implementing the Qdisc_class_ops select_queue() on mqprio fixes this issue and avoid an inconsistent state when child qdiscs are replaced. Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- net/sched/sch_mqprio.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 51c2b289c69b..4d5ed45123f0 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -575,6 +575,12 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } +static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); +} + static const struct Qdisc_class_ops mqprio_class_ops = { .graft = mqprio_graft, .leaf = mqprio_leaf, @@ -582,6 +588,7 @@ static const struct Qdisc_class_ops mqprio_class_ops = { .walk = mqprio_walk, .dump = mqprio_dump_class, .dump_stats = mqprio_dump_class_stats, + .select_queue = mqprio_select_queue, }; static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { -- cgit v1.2.3 From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:26 -0700 Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc This queueing discipline implements the shaper algorithm defined by the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L. It's primary usage is to apply some bandwidth reservation to user defined traffic classes, which are mapped to different queues via the mqprio qdisc. Only a simple software implementation is added for now. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_cbs.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 305 insertions(+) create mode 100644 net/sched/sch_cbs.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e70ed26485a2..c03d86a7775e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -172,6 +172,17 @@ config NET_SCH_TBF To compile this code as a module, choose M here: the module will be called sch_tbf. +config NET_SCH_CBS + tristate "Credit Based Shaper (CBS)" + ---help--- + Say Y here if you want to use the Credit Based Shaper (CBS) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_cbs. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 7b915d226de7..80c8f92d162d 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c new file mode 100644 index 000000000000..0e85133c5653 --- /dev/null +++ b/net/sched/sch_cbs.c @@ -0,0 +1,293 @@ +/* + * net/sched/sch_cbs.c Credit Based Shaper + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Vinicius Costa Gomes + * + */ + +/* Credit Based Shaper (CBS) + * ========================= + * + * This is a simple rate-limiting shaper aimed at TSN applications on + * systems with known traffic workloads. + * + * Its algorithm is defined by the IEEE 802.1Q-2014 Specification, + * Section 8.6.8.2, and explained in more detail in the Annex L of the + * same specification. + * + * There are four tunables to be considered: + * + * 'idleslope': Idleslope is the rate of credits that is + * accumulated (in kilobits per second) when there is at least + * one packet waiting for transmission. Packets are transmitted + * when the current value of credits is equal or greater than + * zero. When there is no packet to be transmitted the amount of + * credits is set to zero. This is the main tunable of the CBS + * algorithm. + * + * 'sendslope': + * Sendslope is the rate of credits that is depleted (it should be a + * negative number of kilobits per second) when a transmission is + * ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section + * 8.6.8.2 item g): + * + * sendslope = idleslope - port_transmit_rate + * + * 'hicredit': Hicredit defines the maximum amount of credits (in + * bytes) that can be accumulated. Hicredit depends on the + * characteristics of interfering traffic, + * 'max_interference_size' is the maximum size of any burst of + * traffic that can delay the transmission of a frame that is + * available for transmission for this traffic class, (IEEE + * 802.1Q-2014 Annex L, Equation L-3): + * + * hicredit = max_interference_size * (idleslope / port_transmit_rate) + * + * 'locredit': Locredit is the minimum amount of credits that can + * be reached. It is a function of the traffic flowing through + * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2): + * + * locredit = max_frame_size * (sendslope / port_transmit_rate) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BYTES_PER_KBIT (1000LL / 8) + +struct cbs_sched_data { + s64 port_rate; /* in bytes/s */ + s64 last; /* timestamp in ns */ + s64 credits; /* in bytes */ + s32 locredit; /* in bytes */ + s32 hicredit; /* in bytes */ + s64 sendslope; /* in bytes/s */ + s64 idleslope; /* in bytes/s */ + struct qdisc_watchdog watchdog; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + struct sk_buff *(*dequeue)(struct Qdisc *sch); +}; + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (sch->q.qlen == 0 && q->credits > 0) { + /* We need to stop accumulating credits when there's + * no enqueued packets and q->credits is positive. + */ + q->credits = 0; + q->last = ktime_get_ns(); + } + + return qdisc_enqueue_tail(skb, sch); +} + +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->enqueue(skb, sch); +} + +/* timediff is in ns, slope is in bytes/s */ +static s64 timediff_to_credits(s64 timediff, s64 slope) +{ + return div64_s64(timediff * slope, NSEC_PER_SEC); +} + +static s64 delay_from_credits(s64 credits, s64 slope) +{ + if (unlikely(slope == 0)) + return S64_MAX; + + return div64_s64(-credits * NSEC_PER_SEC, slope); +} + +static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) +{ + if (unlikely(port_rate == 0)) + return S64_MAX; + + return div64_s64(len * slope, port_rate); +} + +static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + s64 now = ktime_get_ns(); + struct sk_buff *skb; + s64 credits; + int len; + + if (q->credits < 0) { + credits = timediff_to_credits(now - q->last, q->idleslope); + + credits = q->credits + credits; + q->credits = min_t(s64, credits, q->hicredit); + + if (q->credits < 0) { + s64 delay; + + delay = delay_from_credits(q->credits, q->idleslope); + qdisc_watchdog_schedule_ns(&q->watchdog, now + delay); + + q->last = now; + + return NULL; + } + } + + skb = qdisc_dequeue_head(sch); + if (!skb) + return NULL; + + len = qdisc_pkt_len(skb); + + /* As sendslope is a negative number, this will decrease the + * amount of q->credits. + */ + credits = credits_from_len(len, q->sendslope, q->port_rate); + credits += q->credits; + + q->credits = max_t(s64, credits, q->locredit); + q->last = now; + + return skb; +} + +static struct sk_buff *cbs_dequeue(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + +static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { + [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, +}; + +static int cbs_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_CBS_MAX + 1]; + struct ethtool_link_ksettings ecmd; + struct tc_cbs_qopt *qopt; + s64 link_speed; + int err; + + err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); + if (err < 0) + return err; + + if (!tb[TCA_CBS_PARMS]) + return -EINVAL; + + qopt = nla_data(tb[TCA_CBS_PARMS]); + + if (qopt->offload) + return -EOPNOTSUPP; + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + q->hicredit = qopt->hicredit; + q->locredit = qopt->locredit; + q->idleslope = qopt->idleslope * BYTES_PER_KBIT; + q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + + return 0; +} + +static int cbs_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (!opt) + return -EINVAL; + + qdisc_watchdog_init(&q->watchdog, sch); + + return cbs_change(sch, opt); +} + +static void cbs_destroy(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct tc_cbs_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.hicredit = q->hicredit; + opt.locredit = q->locredit; + opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); + opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); + opt.offload = 0; + + if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { + .id = "cbs", + .priv_size = sizeof(struct cbs_sched_data), + .enqueue = cbs_enqueue, + .dequeue = cbs_dequeue, + .peek = qdisc_peek_dequeued, + .init = cbs_init, + .reset = qdisc_reset_queue, + .destroy = cbs_destroy, + .change = cbs_change, + .dump = cbs_dump, + .owner = THIS_MODULE, +}; + +static int __init cbs_module_init(void) +{ + return register_qdisc(&cbs_qdisc_ops); +} + +static void __exit cbs_module_exit(void) +{ + unregister_qdisc(&cbs_qdisc_ops); +} +module_init(cbs_module_init) +module_exit(cbs_module_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:27 -0700 Subject: net/sched: Add support for HW offloading for CBS This adds support for offloading the CBS algorithm to the controller, if supported. Drivers wanting to support CBS offload must implement the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced here) type. Signed-off-by: Vinicius Costa Gomes Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- net/sched/sch_cbs.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 0e85133c5653..bdb533b7fb8c 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -68,6 +68,8 @@ #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { + bool offload; + int queue; s64 port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ @@ -80,6 +82,11 @@ struct cbs_sched_data { struct sk_buff *(*dequeue)(struct Qdisc *sch); }; +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +{ + return qdisc_enqueue_tail(skb, sch); +} + static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -169,6 +176,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return skb; } +static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) +{ + return qdisc_dequeue_head(sch); +} + static struct sk_buff *cbs_dequeue(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -180,14 +192,66 @@ static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; +static void cbs_disable_offload(struct net_device *dev, + struct cbs_sched_data *q) +{ + struct tc_cbs_qopt_offload cbs = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + cbs.queue = q->queue; + cbs.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + pr_warn("Couldn't disable CBS offload for queue %d\n", + cbs.queue); +} + +static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, + const struct tc_cbs_qopt *opt) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_cbs_qopt_offload cbs = { }; + int err; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + cbs.queue = q->queue; + + cbs.enable = 1; + cbs.hicredit = opt->hicredit; + cbs.locredit = opt->locredit; + cbs.idleslope = opt->idleslope; + cbs.sendslope = opt->sendslope; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + return err; + + q->enqueue = cbs_enqueue_offload; + q->dequeue = cbs_dequeue_offload; + + return 0; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_CBS_MAX + 1]; - struct ethtool_link_ksettings ecmd; struct tc_cbs_qopt *qopt; - s64 link_speed; int err; err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); @@ -199,23 +263,30 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) qopt = nla_data(tb[TCA_CBS_PARMS]); - if (qopt->offload) - return -EOPNOTSUPP; + if (!qopt->offload) { + struct ethtool_link_ksettings ecmd; + s64 link_speed; - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - q->enqueue = cbs_enqueue_soft; - q->dequeue = cbs_dequeue_soft; + cbs_disable_offload(dev, q); + } else { + err = cbs_enable_offload(dev, q, qopt); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->hicredit = qopt->hicredit; q->locredit = qopt->locredit; q->idleslope = qopt->idleslope * BYTES_PER_KBIT; q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + q->offload = qopt->offload; return 0; } @@ -223,10 +294,16 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) static int cbs_init(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); if (!opt) return -EINVAL; + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + qdisc_watchdog_init(&q->watchdog, sch); return cbs_change(sch, opt); @@ -235,8 +312,11 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt) static void cbs_destroy(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); qdisc_watchdog_cancel(&q->watchdog); + + cbs_disable_offload(dev, q); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -253,7 +333,7 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) opt.locredit = q->locredit; opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = 0; + opt.offload = q->offload; if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 57ab1ca215971702df534ae93cd76c15ca084c77 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 10:50:07 -0400 Subject: net: dsa: move fixed link registration helpers The new bindings (dsa2.c) and the old bindings (legacy.c) share two helpers dsa_cpu_dsa_setup and dsa_cpu_dsa_destroy, used to register or deregister a fixed PHY if a given port has a corresponding device node. Unclutter the code by moving them into two new port.c helpers, dsa_port_fixed_link_register_of and dsa_port_fixed_link_(un)register_of. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 39 --------------------------------------- net/dsa/dsa2.c | 8 ++++---- net/dsa/dsa_priv.h | 5 +++-- net/dsa/legacy.c | 4 ++-- net/dsa/port.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 56 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index fe0081730305..b8f2d9f7c3ed 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -68,37 +68,6 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { [DSA_TAG_PROTO_NONE] = &none_ops, }; -int dsa_cpu_dsa_setup(struct dsa_port *port) -{ - struct device_node *port_dn = port->dn; - struct dsa_switch *ds = port->ds; - struct phy_device *phydev; - int ret, mode; - - if (of_phy_is_fixed_link(port_dn)) { - ret = of_phy_register_fixed_link(port_dn); - if (ret) { - dev_err(ds->dev, "failed to register fixed PHY\n"); - return ret; - } - phydev = of_phy_find_device(port_dn); - - mode = of_get_phy_mode(port_dn); - if (mode < 0) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; - - genphy_config_init(phydev); - genphy_read_status(phydev); - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port->index, phydev); - - put_device(&phydev->mdio.dev); - } - - return 0; -} - const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) { const struct dsa_device_ops *ops; @@ -113,14 +82,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -void dsa_cpu_dsa_destroy(struct dsa_port *port) -{ - struct device_node *port_dn = port->dn; - - if (of_phy_is_fixed_link(port_dn)) - of_phy_deregister_fixed_link(port_dn); -} - static int dev_is_class(struct device *dev, void *class) { if (dev->class != NULL && !strcmp(dev->class->name, class)) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index ec58654a71cd..de91c48b6806 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -219,7 +219,7 @@ static int dsa_dsa_port_apply(struct dsa_port *port) struct dsa_switch *ds = port->ds; int err; - err = dsa_cpu_dsa_setup(port); + err = dsa_port_fixed_link_register_of(port); if (err) { dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", port->index, err); @@ -235,7 +235,7 @@ static int dsa_dsa_port_apply(struct dsa_port *port) static void dsa_dsa_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); - dsa_cpu_dsa_destroy(port); + dsa_port_fixed_link_unregister_of(port); } static int dsa_cpu_port_apply(struct dsa_port *port) @@ -243,7 +243,7 @@ static int dsa_cpu_port_apply(struct dsa_port *port) struct dsa_switch *ds = port->ds; int err; - err = dsa_cpu_dsa_setup(port); + err = dsa_port_fixed_link_register_of(port); if (err) { dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", port->index, err); @@ -259,7 +259,7 @@ static int dsa_cpu_port_apply(struct dsa_port *port) static void dsa_cpu_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); - dsa_cpu_dsa_destroy(port); + dsa_port_fixed_link_unregister_of(port); } static int dsa_user_port_apply(struct dsa_port *port) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1e9914062d0b..1e65afd6989e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -93,8 +93,6 @@ struct dsa_slave_priv { }; /* dsa.c */ -int dsa_cpu_dsa_setup(struct dsa_port *port); -void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); bool dsa_schedule_work(struct work_struct *work); @@ -159,6 +157,9 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_fixed_link_register_of(struct dsa_port *dp); +void dsa_port_fixed_link_unregister_of(struct dsa_port *dp); + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 93e1b116ef83..ed7aae342fca 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -86,7 +86,7 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - ret = dsa_cpu_dsa_setup(&ds->ports[port]); + ret = dsa_port_fixed_link_register_of(&ds->ports[port]); if (ret) return ret; } @@ -274,7 +274,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) for (port = 0; port < ds->num_ports; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - dsa_cpu_dsa_destroy(&ds->ports[port]); + dsa_port_fixed_link_unregister_of(&ds->ports[port]); } if (ds->slave_mii_bus && ds->ops->phy_read) diff --git a/net/dsa/port.c b/net/dsa/port.c index 72c8dbd3d3f2..bb30b1a7de3a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -12,6 +12,8 @@ #include #include +#include +#include #include "dsa_priv.h" @@ -264,3 +266,48 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } + +int dsa_port_fixed_link_register_of(struct dsa_port *dp) +{ + struct device_node *dn = dp->dn; + struct dsa_switch *ds = dp->ds; + struct phy_device *phydev; + int port = dp->index; + int mode; + int err; + + if (of_phy_is_fixed_link(dn)) { + err = of_phy_register_fixed_link(dn); + if (err) { + dev_err(ds->dev, + "failed to register the fixed PHY of port %d\n", + port); + return err; + } + + phydev = of_phy_find_device(dn); + + mode = of_get_phy_mode(dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + + genphy_config_init(phydev); + genphy_read_status(phydev); + + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); + + put_device(&phydev->mdio.dev); + } + + return 0; +} + +void dsa_port_fixed_link_unregister_of(struct dsa_port *dp) +{ + struct device_node *dn = dp->dn; + + if (of_phy_is_fixed_link(dn)) + of_phy_deregister_fixed_link(dn); +} -- cgit v1.2.3 From 74b6551b9f4106a3e69ab46e60b52a8947dcf60c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 26 Oct 2017 23:10:35 -0500 Subject: ipv6: exthdrs: use swap macro in ipv6_dest_hao make use of the swap macro and remove unnecessary variable tmp_addr. This makes the code easier to read and maintain. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 7835dea930b4..9f918a770f87 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -187,7 +187,6 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct in6_addr tmp_addr; int ret; if (opt->dsthao) { @@ -229,9 +228,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; - tmp_addr = ipv6h->saddr; - ipv6h->saddr = hao->addr; - hao->addr = tmp_addr; + swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); -- cgit v1.2.3 From ec36e416f06f6a8659281053fdc46ce484ad2211 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:21 -0700 Subject: tcp: Namespace-ify sysctl_tcp_nometrics_save Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_metrics.c | 4 +--- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f1bcb9b7e082..b742a5e26a9d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_no_metrics_save", - .data = &sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_no_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0ab78abc811b..0507b56b6d4b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -20,8 +20,6 @@ #include #include -int sysctl_tcp_nometrics_save __read_mostly; - static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); @@ -330,7 +328,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (sysctl_tcp_nometrics_save || !dst) + if (net->ipv4.sysctl_tcp_nometrics_save || !dst) return; rcu_read_lock(); -- cgit v1.2.3 From 4540c0cf98b8892a642d2453eec20ae3eb5696fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:22 -0700 Subject: tcp: Namespace-ify sysctl_tcp_moderate_rcvbuf Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b742a5e26a9d..2ebe87fd1169 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_moderate_rcvbuf", - .data = &sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_tso_win_divisor", .data = &sysctl_tcp_tso_win_divisor, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 90d76f1c8f96..ce481325115f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -84,7 +84,6 @@ int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; -int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ @@ -411,7 +410,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ - if (sysctl_tcp_moderate_rcvbuf) + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) @@ -602,7 +601,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (sysctl_tcp_moderate_rcvbuf && + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49757c758211..27f376b90913 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2493,6 +2493,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From d06a99045837d3f4d5431793c4c390b0daf2a08d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:23 -0700 Subject: tcp: Namespace-ify sysctl_tcp_tso_win_divisor Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 5 +++++ net/ipv4/tcp_output.c | 8 +------- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2ebe87fd1169..a053cacb8290 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_tso_win_divisor", - .data = &sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_congestion_control", .mode = 0644, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_tso_win_divisor", + .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 27f376b90913..284ff16148df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,11 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 55a0aa4b96df..60df3ab52166 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -53,12 +53,6 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; /* Default TSQ limit of four TSO segments */ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; -/* This limits the percentage of the congestion window which we - * will allow a single TSO frame to consume. Building TSO frames - * which are too large can cause TCP streams to be bursty. - */ -int sysctl_tcp_tso_win_divisor __read_mostly = 3; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1988,7 +1982,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); -- cgit v1.2.3 From ceef9ab6be7234f9e49f79769e0da88d1dccfcc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:24 -0700 Subject: tcp: Namespace-ify sysctl_tcp_workaround_signed_windows Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 14 +++++--------- net/ipv6/syncookies.c | 2 +- 5 files changed, 15 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 77cf32a80952..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a053cacb8290..3ae9012a4979 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_limit_output_bytes", .data = &sysctl_tcp_limit_output_bytes, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3270ab8416ce..3c65c1a3f944 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -369,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 60df3ab52166..5bbed67c27e9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,11 +45,6 @@ #include -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - /* Default TSQ limit of four TSO segments */ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; @@ -196,7 +191,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -220,7 +215,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -280,7 +275,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. */ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -3349,7 +3345,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 4e7817abc0b9..e7a3a6b6cf56 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -244,7 +244,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); -- cgit v1.2.3 From 9184d8bb448a3d2c2d9f90f1e2f5de625292e769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:25 -0700 Subject: tcp: Namespace-ify sysctl_tcp_limit_output_bytes Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_output.c | 6 ++---- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3ae9012a4979..6caf5c40730f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_limit_output_bytes", - .data = &sysctl_tcp_limit_output_bytes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_challenge_ack_limit", .data = &sysctl_tcp_challenge_ack_limit, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 284ff16148df..713b80261e4f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2499,6 +2499,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bbed67c27e9..f018892c6a98 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,9 +45,6 @@ #include -/* Default TSQ limit of four TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 262144; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -2215,7 +2212,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int limit; limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit = min_t(u32, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { -- cgit v1.2.3 From b530b68148301d73775cd27cc136ce4dd5738ae8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:26 -0700 Subject: tcp: Namespace-ify sysctl_tcp_challenge_ack_limit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 14 ++++++-------- net/ipv4/tcp_ipv4.c | 2 ++ 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6caf5c40730f..e28b3b7a7bbc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_challenge_ack_limit", - .data = &sysctl_tcp_challenge_ack_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_challenge_ack_limit", + .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ce481325115f..928048a4e2c5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,9 +79,6 @@ #include #include -/* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 1000; - int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -3443,10 +3440,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); u32 count, now; /* First check our per-socket dupack rate limit. */ - if (__tcp_oow_rate_limited(sock_net(sk), + if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; @@ -3454,16 +3452,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + - prandom_u32_max(sysctl_tcp_challenge_ack_limit)); + WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); if (count > 0) { WRITE_ONCE(challenge_count, count - 1); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 713b80261e4f..50ab3a3eced3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2501,6 +2501,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of four TSO segments */ net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 26e9596e5b8f11025b57b12e7265df649129ab00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:27 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_tso_segs Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 ++- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e28b3b7a7bbc..00b4aea3705b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -499,15 +499,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_min_tso_segs", - .data = &sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &gso_max_segs, - }, { .procname = "tcp_pacing_ss_ratio", .data = &sysctl_tcp_pacing_ss_ratio, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_min_tso_segs", + .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &gso_max_segs, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c7c983f0f817..a01c97708d83 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,8 +285,6 @@ #include -int sysctl_tcp_min_tso_segs __read_mostly = 2; - int sysctl_tcp_autocorking __read_mostly = 1; struct percpu_counter tcp_orphan_count; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 50ab3a3eced3..6192f26145d3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2503,6 +2503,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_limit_output_bytes = 262144; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f018892c6a98..aab6e7145013 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1758,7 +1758,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; return tso_segs ? : - tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.3 From bd239704295c66196e6b77c5717ec4aec076ddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:28 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_rtt_wlen Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 00b4aea3705b..029692d2e4ae 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_min_rtt_wlen", - .data = &sysctl_tcp_min_rtt_wlen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one, .extra2 = &gso_max_segs, }, + { + .procname = "tcp_min_rtt_wlen", + .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 928048a4e2c5..da1ef666d1f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,7 +80,6 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ @@ -2915,8 +2914,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { + u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); - u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6192f26145d3..ced35af5737a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2504,6 +2504,7 @@ static int __net_init tcp_sk_init(struct net *net) /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 790f00e19f65673c3c169dfc137c09a9236847d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:29 -0700 Subject: tcp: Namespace-ify sysctl_tcp_autocorking Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp.c | 4 +--- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 029692d2e4ae..43a18a317053 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -510,15 +510,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &thousand, }, - { - .procname = "tcp_autocorking", - .data = &sysctl_tcp_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, { .procname = "tcp_invalid_ratelimit", .data = &sysctl_tcp_invalid_ratelimit, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_autocorking", + .data = &init_net.ipv4.sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a01c97708d83..a7a0f316eb86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,8 +285,6 @@ #include -int sysctl_tcp_autocorking __read_mostly = 1; - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -697,7 +695,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sysctl_tcp_autocorking && + sock_net(sk)->ipv4.sysctl_tcp_autocorking && skb != tcp_write_queue_head(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ced35af5737a..351e3497c8f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2505,6 +2505,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 4170ba6b589ced82da56c7e4f71cc84b2be036d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:30 -0700 Subject: tcp: Namespace-ify sysctl_tcp_invalid_ratelimit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 43a18a317053..6a9349c27f00 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -510,13 +510,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &thousand, }, - { - .procname = "tcp_invalid_ratelimit", - .data = &sysctl_tcp_invalid_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "tcp_invalid_ratelimit", + .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da1ef666d1f9..db4d458d0205 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,7 +80,6 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -3403,7 +3402,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 351e3497c8f3..6617aae18ba2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2506,6 +2506,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 23a7102a2d1068508fa2a0ce593a0df7f8fdc0ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:31 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ss_ratio Also remove an obsolete comment about TCP pacing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 9 +-------- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6a9349c27f00..7f0dba852d47 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -492,15 +492,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_pacing_ss_ratio", - .data = &sysctl_tcp_pacing_ss_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, { .procname = "tcp_pacing_ca_ratio", .data = &sysctl_tcp_pacing_ca_ratio, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db4d458d0205..29539d39e61a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -767,13 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -/* Set the sk_pacing_rate to allow proper sizing of TSO packets. - * Note: TCP stack does not yet implement pacing. - * FQ packet scheduler can be used to implement cheap but effective - * TCP pacing, to smooth the burst on large writes when packets - * in flight is significantly lower than cwnd (or rwin) - */ -int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; static void tcp_update_pacing_rate(struct sock *sk) @@ -793,7 +786,7 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sysctl_tcp_pacing_ss_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else rate *= sysctl_tcp_pacing_ca_ratio; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6617aae18ba2..1d8fc663af51 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2507,6 +2507,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From c26e91f8b9b8e1fd252e07c1f60e50220cd7ebab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:32 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ca_ratio Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 4 +--- net/ipv4/tcp_ipv4.c | 1 + 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7f0dba852d47..4602af6d5358 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -492,15 +492,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_pacing_ca_ratio", - .data = &sysctl_tcp_pacing_ca_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 29539d39e61a..21c358c0cf2e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -767,8 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; - static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -788,7 +786,7 @@ static void tcp_update_pacing_rate(struct sock *sk) if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else - rate *= sysctl_tcp_pacing_ca_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1d8fc663af51..7c1dae6493c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2508,6 +2508,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From e19b42a1a0669ed5b8009930c5269a5a87cc363c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 27 Oct 2017 13:19:36 +0300 Subject: bridge: netlink: make setlink/dellink notifications more accurate Before this patch we had cases that either sent notifications when there were in fact no changes (e.g. non-existent vlan delete) or didn't send notifications when there were changes (e.g. vlan add range with an error in the middle, port flags change + vlan update error). This patch sends down a boolean to the functions setlink/dellink use and if there is even a single configuration change (port flag, vlan add/del, port state) then we always send a notification. This is all done to keep backwards compatibility with the opportunistic vlan delete, where one could specify a vlan range that has missing vlans inside and still everything in that range will be cleared, this is mostly used to clear the whole vlan config with a single call, i.e. range 1-4094. Signed-off-by: Nikolay Aleksandrov Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 44 +++++++++++++++++++++++++----------------- net/bridge/br_netlink_tunnel.c | 14 +++++++++----- net/bridge/br_private_tunnel.h | 3 ++- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index fb61b6c79235..d0290ede9342 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -506,7 +506,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, - int cmd, struct bridge_vlan_info *vinfo) + int cmd, struct bridge_vlan_info *vinfo, bool *changed) { int err = 0; @@ -517,21 +517,24 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags); - if (err) - break; } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags); } + if (!err) + *changed = true; break; case RTM_DELLINK: if (p) { - nbp_vlan_delete(p, vinfo->vid); - if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER) - br_vlan_delete(p->br, vinfo->vid); - } else { - br_vlan_delete(br, vinfo->vid); + if (!nbp_vlan_delete(p, vinfo->vid)) + *changed = true; + + if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) && + !br_vlan_delete(p->br, vinfo->vid)) + *changed = true; + } else if (!br_vlan_delete(br, vinfo->vid)) { + *changed = true; } break; } @@ -542,7 +545,8 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, static int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, - struct bridge_vlan_info **vinfo_last) + struct bridge_vlan_info **vinfo_last, + bool *changed) { if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) return -EINVAL; @@ -572,7 +576,7 @@ static int br_process_vlan_info(struct net_bridge *br, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo); + err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed); if (err) break; } @@ -581,13 +585,13 @@ static int br_process_vlan_info(struct net_bridge *br, return err; } - return br_vlan_info(br, p, cmd, vinfo_curr); + return br_vlan_info(br, p, cmd, vinfo_curr, changed); } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, - int cmd) + int cmd, bool *changed) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; @@ -607,7 +611,8 @@ static int br_afspec(struct net_bridge *br, return err; err = br_process_vlan_tunnel_info(br, p, cmd, &tinfo_curr, - &tinfo_last); + &tinfo_last, + changed); if (err) return err; break; @@ -616,7 +621,7 @@ static int br_afspec(struct net_bridge *br, return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, - &vinfo_last); + &vinfo_last, changed); if (err) return err; break; @@ -804,6 +809,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) struct nlattr *afspec; struct net_bridge_port *p; struct nlattr *tb[IFLA_BRPORT_MAX + 1]; + bool changed = false; int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); @@ -839,14 +845,15 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) } if (err) goto out; + changed = true; } if (afspec) { err = br_afspec((struct net_bridge *)netdev_priv(dev), p, - afspec, RTM_SETLINK); + afspec, RTM_SETLINK, &changed); } - if (err == 0) + if (changed) br_ifinfo_notify(RTM_NEWLINK, p); out: return err; @@ -857,6 +864,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; + bool changed = false; int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); @@ -869,8 +877,8 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) return -EINVAL; err = br_afspec((struct net_bridge *)netdev_priv(dev), p, - afspec, RTM_DELLINK); - if (err == 0) + afspec, RTM_DELLINK, &changed); + if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels */ diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 3712c7f0e00c..da8cb99fd259 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -198,7 +198,7 @@ static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + }; static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd, - u16 vid, u32 tun_id) + u16 vid, u32 tun_id, bool *changed) { int err = 0; @@ -208,9 +208,12 @@ static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd, switch (cmd) { case RTM_SETLINK: err = nbp_vlan_tunnel_info_add(p, vid, tun_id); + if (!err) + *changed = true; break; case RTM_DELLINK: - nbp_vlan_tunnel_info_delete(p, vid); + if (!nbp_vlan_tunnel_info_delete(p, vid)) + *changed = true; break; } @@ -254,7 +257,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, int br_process_vlan_tunnel_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, - struct vtunnel_info *tinfo_last) + struct vtunnel_info *tinfo_last, + bool *changed) { int err; @@ -272,7 +276,7 @@ int br_process_vlan_tunnel_info(struct net_bridge *br, return -EINVAL; t = tinfo_last->tunid; for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { - err = br_vlan_tunnel_info(p, cmd, v, t); + err = br_vlan_tunnel_info(p, cmd, v, t, changed); if (err) return err; t++; @@ -283,7 +287,7 @@ int br_process_vlan_tunnel_info(struct net_bridge *br, if (tinfo_last->flags) return -EINVAL; err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid, - tinfo_curr->tunid); + tinfo_curr->tunid, changed); if (err) return err; memset(tinfo_last, 0, sizeof(struct vtunnel_info)); diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index 4a447a378ab3..a259471bfd78 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -26,7 +26,8 @@ int br_process_vlan_tunnel_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, - struct vtunnel_info *tinfo_last); + struct vtunnel_info *tinfo_last, + bool *changed); int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg); int br_fill_vlan_tunnel_info(struct sk_buff *skb, struct net_bridge_vlan_group *vg); -- cgit v1.2.3 From f418af6343fbfaaa3306e7cf15906be4f20c69ff Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 27 Oct 2017 13:19:37 +0300 Subject: bridge: vlan: signal if anything changed on vlan add Before this patch there was no way to tell if the vlan add operation actually changed anything, thus we would always generate a notification on adds. Let's make the notifications more precise and generate them only if anything changed, so use the new bool parameter to signal that the vlan was updated. We cannot return an error because there are valid use cases that will be broken (e.g. overlapping range add) and also we can't risk masking errors due to calls into drivers for vlan add which can potentially return anything. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 9 ++++-- net/bridge/br_private.h | 14 ++++++--- net/bridge/br_vlan.c | 78 ++++++++++++++++++++++++++++++++++++------------- 3 files changed, 73 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d0290ede9342..e732403669c6 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -508,6 +508,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo, bool *changed) { + bool curr_change; int err = 0; switch (cmd) { @@ -516,12 +517,14 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, /* if the MASTER flag is set this will act on the global * per-VLAN entry as well */ - err = nbp_vlan_add(p, vinfo->vid, vinfo->flags); + err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, + &curr_change); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; - err = br_vlan_add(br, vinfo->vid, vinfo->flags); + err = br_vlan_add(br, vinfo->vid, vinfo->flags, + &curr_change); } - if (!err) + if (curr_change) *changed = true; break; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index fa0039f44818..860e4afaf71a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -803,7 +803,8 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb); -int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); +int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, + bool *changed); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); @@ -816,7 +817,8 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); -int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); +int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, + bool *changed); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port); @@ -903,8 +905,10 @@ static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, return skb; } -static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) +static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, + bool *changed) { + *changed = false; return -EOPNOTSUPP; } @@ -926,8 +930,10 @@ static inline int br_vlan_init(struct net_bridge *br) return 0; } -static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) +static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, + bool *changed) { + *changed = false; return -EOPNOTSUPP; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 233a30040c91..51935270c651 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -32,27 +32,34 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid == vid) - return; + return false; smp_wmb(); vg->pvid = vid; + + return true; } -static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static bool __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) - return; + return false; smp_wmb(); vg->pvid = 0; + + return true; } -static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) +/* return true if anything changed, false otherwise */ +static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) { struct net_bridge_vlan_group *vg; + u16 old_flags = v->flags; + bool ret; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); @@ -60,14 +67,16 @@ static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) vg = nbp_vlan_group(v->port); if (flags & BRIDGE_VLAN_INFO_PVID) - __vlan_add_pvid(vg, v->vid); + ret = __vlan_add_pvid(vg, v->vid); else - __vlan_delete_pvid(vg, v->vid); + ret = __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; + + return ret || !!(old_flags ^ v->flags); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, @@ -151,8 +160,10 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid vg = br_vlan_group(br); masterv = br_vlan_find(vg, vid); if (!masterv) { + bool changed; + /* missing global ctx, create it now */ - if (br_vlan_add(br, vid, 0)) + if (br_vlan_add(br, vid, 0, &changed)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) @@ -232,8 +243,11 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { - err = br_vlan_add(br, v->vid, flags | - BRIDGE_VLAN_INFO_BRENTRY); + bool changed; + + err = br_vlan_add(br, v->vid, + flags | BRIDGE_VLAN_INFO_BRENTRY, + &changed); if (err) goto out_filt; } @@ -550,8 +564,9 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. + * changed must be true only if the vlan was created or updated */ -int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) +int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; @@ -559,6 +574,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) ASSERT_RTNL(); + *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) { @@ -576,8 +592,11 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) refcount_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; + *changed = true; } - __vlan_add_flags(vlan, flags); + if (__vlan_add_flags(vlan, flags)) + *changed = true; + return 0; } @@ -600,6 +619,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) if (ret) { free_percpu(vlan->stats); kfree(vlan); + } else { + *changed = true; } return ret; @@ -824,9 +845,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; + unsigned long *changed; + bool vlchange; u16 old_pvid; int err = 0; - unsigned long *changed; if (!pvid) { br_vlan_disable_default_pvid(br); @@ -850,7 +872,8 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_BRENTRY); + BRIDGE_VLAN_INFO_BRENTRY, + &vlchange); if (err) goto out; br_vlan_delete(br, old_pvid); @@ -869,7 +892,8 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED, + &vlchange); if (err) goto err_port; nbp_vlan_delete(p, old_pvid); @@ -890,7 +914,8 @@ err_port: if (old_pvid) nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED, + &vlchange); nbp_vlan_delete(p, pvid); } @@ -899,7 +924,8 @@ err_port: br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_BRENTRY); + BRIDGE_VLAN_INFO_BRENTRY, + &vlchange); br_vlan_delete(br, pvid); } goto out; @@ -931,6 +957,7 @@ int br_vlan_init(struct net_bridge *br) { struct net_bridge_vlan_group *vg; int ret = -ENOMEM; + bool changed; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) @@ -947,7 +974,7 @@ int br_vlan_init(struct net_bridge *br) rcu_assign_pointer(br->vlgrp, vg); ret = br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_BRENTRY); + BRIDGE_VLAN_INFO_BRENTRY, &changed); if (ret) goto err_vlan_add; @@ -992,9 +1019,12 @@ int nbp_vlan_init(struct net_bridge_port *p) INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { + bool changed; + ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | - BRIDGE_VLAN_INFO_UNTAGGED); + BRIDGE_VLAN_INFO_UNTAGGED, + &changed); if (ret) goto err_vlan_add; } @@ -1016,8 +1046,10 @@ err_vlan_enabled: /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. + * changed must be true only if the vlan was created or updated */ -int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) +int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, + bool *changed) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = port->dev, @@ -1031,13 +1063,15 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) ASSERT_RTNL(); + *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { /* Pass the flags to the hardware bridge */ ret = switchdev_port_obj_add(port->dev, &v.obj); if (ret && ret != -EOPNOTSUPP) return ret; - __vlan_add_flags(vlan, flags); + *changed = __vlan_add_flags(vlan, flags); + return 0; } @@ -1050,6 +1084,8 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) ret = __vlan_add(vlan, flags); if (ret) kfree(vlan); + else + *changed = true; return ret; } -- cgit v1.2.3 From 949cf8b1dd39f6d306a681503b6031299fd7ced8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 29 Oct 2017 11:14:08 +0900 Subject: tcp: Remove "linux/unaligned/access_ok.h" include. This causes build failures: In file included from net/ipv4/tcp_input.c:79:0: ./include/linux/unaligned/access_ok.h:7:28: error: redefinition of 'get_unaligned_le16' In file included from ./include/asm-generic/unaligned.h:17:0, from ./arch/arm/include/generated/asm/unaligned.h:1, from net/ipv4/tcp_input.c:76: ./include/linux/unaligned/le_struct.h:6:19: note: previous definition of 'get_unaligned_le16' was here In file included from net/ipv4/tcp_input.c:79:0: ./include/linux/unaligned/access_ok.h:12:28: error: redefinition of 'get_unaligned_le32' Plain "asm/access_ok.h", which is already included, is sufficient. Fixes: 60e2a7780793 ("tcp: TCP experimental option for SMC") Reported-by: Egil Hjelmeland Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21c358c0cf2e..b62a7d1707ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,7 +76,6 @@ #include #include #include -#include #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -- cgit v1.2.3 From 3953ae7b218df4d1e544b98a393666f9ae58a78c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Oct 2017 16:51:50 +0200 Subject: l2tp: don't register sessions in l2tp_session_create() Sessions created by l2tp_session_create() aren't fully initialised: some pseudo-wire specific operations need to be done before making the session usable. Therefore the PPP and Ethernet pseudo-wires continue working on the returned l2tp session while it's already been exposed to the rest of the system. This can lead to various issues. In particular, the session may enter the deletion process before having been fully initialised, which will confuse the session removal code. This patch moves session registration out of l2tp_session_create(), so that callers can control when the session is exposed to the rest of the system. This is done by the new l2tp_session_register() function. Only pppol2tp_session_create() can be easily converted to avoid modifying its session after registration (the debug message is dropped in order to avoid the need for holding a reference on the session). For pppol2tp_connect() and l2tp_eth_create()), more work is needed. That'll be done in followup patches. For now, let's just register the session right after its creation, like it was done before. The only difference is that we can easily take a reference on the session before registering it, so, at least, we're sure it's not going to be freed while we're working on it. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 21 +++++++-------------- net/l2tp/l2tp_core.h | 3 +++ net/l2tp/l2tp_eth.c | 9 +++++++++ net/l2tp/l2tp_ppp.c | 23 +++++++++++++++++------ 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index af22aa8ae35b..a1d56e143fcd 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -322,8 +322,8 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, } EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); -static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel, - struct l2tp_session *session) +int l2tp_session_register(struct l2tp_session *session, + struct l2tp_tunnel *tunnel) { struct l2tp_session *session_walk; struct hlist_head *g_head; @@ -371,6 +371,10 @@ static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel, hlist_add_head(&session->hlist, head); write_unlock_bh(&tunnel->hlist_lock); + /* Ignore management session in session count value */ + if (session->session_id != 0) + atomic_inc(&l2tp_session_count); + return 0; err_tlock_pnlock: @@ -380,6 +384,7 @@ err_tlock: return err; } +EXPORT_SYMBOL_GPL(l2tp_session_register); /* Lookup a tunnel by id */ @@ -1788,7 +1793,6 @@ EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { struct l2tp_session *session; - int err; session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); if (session != NULL) { @@ -1846,17 +1850,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn refcount_set(&session->ref_count, 1); - err = l2tp_session_add_to_tunnel(tunnel, session); - if (err) { - kfree(session); - - return ERR_PTR(err); - } - - /* Ignore management session in session count value */ - if (session->session_id != 0) - atomic_inc(&l2tp_session_count); - return session; } diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 67c79d9b5c6c..77caa5966736 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -263,6 +263,9 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); +int l2tp_session_register(struct l2tp_session *session, + struct l2tp_tunnel *tunnel); + void __l2tp_session_unhash(struct l2tp_session *session); int l2tp_session_delete(struct l2tp_session *session); void l2tp_session_free(struct l2tp_session *session); diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 014a7bc2a872..a7d76f5f31ff 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -271,6 +271,13 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, goto out; } + l2tp_session_inc_refcount(session); + rc = l2tp_session_register(session, tunnel); + if (rc < 0) { + kfree(session); + goto out; + } + dev = alloc_netdev(sizeof(*priv), name, name_assign_type, l2tp_eth_dev_setup); if (!dev) { @@ -304,6 +311,7 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, __module_get(THIS_MODULE); /* Must be done after register_netdev() */ strlcpy(session->ifname, dev->name, IFNAMSIZ); + l2tp_session_dec_refcount(session); dev_hold(dev); @@ -314,6 +322,7 @@ out_del_dev: spriv->dev = NULL; out_del_session: l2tp_session_delete(session); + l2tp_session_dec_refcount(session); out: return rc; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f50452b919d5..40cf7a78e331 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -715,6 +715,14 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = PTR_ERR(session); goto end; } + + l2tp_session_inc_refcount(session); + error = l2tp_session_register(session, tunnel); + if (error < 0) { + kfree(session); + goto end; + } + drop_refcnt = true; } /* Associate session with its PPPoL2TP socket */ @@ -800,7 +808,7 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, /* Error if tunnel socket is not prepped */ if (!tunnel->sock) { error = -ENOENT; - goto out; + goto err; } /* Default MTU values. */ @@ -815,18 +823,21 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, peer_session_id, cfg); if (IS_ERR(session)) { error = PTR_ERR(session); - goto out; + goto err; } ps = l2tp_session_priv(session); ps->tunnel_sock = tunnel->sock; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n", - session->name); + error = l2tp_session_register(session, tunnel); + if (error < 0) + goto err_sess; - error = 0; + return 0; -out: +err_sess: + kfree(session); +err: return error; } -- cgit v1.2.3 From ee28de6bbd78c2e18111a0aef43ea746f28d2073 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Oct 2017 16:51:51 +0200 Subject: l2tp: initialise l2tp_eth sessions before registering them Sessions must be initialised before being made externally visible by l2tp_session_register(). Otherwise the session may be concurrently deleted before being initialised, which can confuse the deletion path and eventually lead to kernel oops. Therefore, we need to move l2tp_session_register() down in l2tp_eth_create(), but also handle the intermediate step where only the session or the netdevice has been registered. We can't just call l2tp_session_register() in ->ndo_init() because we'd have no way to properly undo this operation in ->ndo_uninit(). Instead, let's register the session and the netdevice in two different steps and protect the session's device pointer with RCU. And now that we allow the session's .dev field to be NULL, we don't need to prevent the netdevice from being removed anymore. So we can drop the dev_hold() and dev_put() calls in l2tp_eth_create() and l2tp_eth_dev_uninit(). Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 106 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index a7d76f5f31ff..d29bfee291cb 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -54,7 +54,7 @@ struct l2tp_eth { /* via l2tp_session_priv() */ struct l2tp_eth_sess { - struct net_device *dev; + struct net_device __rcu *dev; }; @@ -72,7 +72,14 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - dev_put(dev); + struct l2tp_eth *priv = netdev_priv(dev); + struct l2tp_eth_sess *spriv; + + spriv = l2tp_session_priv(priv->session); + RCU_INIT_POINTER(spriv->dev, NULL); + /* No need for synchronize_net() here. We're called by + * unregister_netdev*(), which does the synchronisation for us. + */ } static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) @@ -130,8 +137,8 @@ static void l2tp_eth_dev_setup(struct net_device *dev) static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct l2tp_eth_sess *spriv = l2tp_session_priv(session); - struct net_device *dev = spriv->dev; - struct l2tp_eth *priv = netdev_priv(dev); + struct net_device *dev; + struct l2tp_eth *priv; if (session->debug & L2TP_MSG_DATA) { unsigned int length; @@ -155,16 +162,25 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb_dst_drop(skb); nf_reset(skb); + rcu_read_lock(); + dev = rcu_dereference(spriv->dev); + if (!dev) + goto error_rcu; + + priv = netdev_priv(dev); if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { atomic_long_inc(&priv->rx_packets); atomic_long_add(data_len, &priv->rx_bytes); } else { atomic_long_inc(&priv->rx_errors); } + rcu_read_unlock(); + return; +error_rcu: + rcu_read_unlock(); error: - atomic_long_inc(&priv->rx_errors); kfree_skb(skb); } @@ -175,11 +191,15 @@ static void l2tp_eth_delete(struct l2tp_session *session) if (session) { spriv = l2tp_session_priv(session); - dev = spriv->dev; + + rtnl_lock(); + dev = rtnl_dereference(spriv->dev); if (dev) { - unregister_netdev(dev); - spriv->dev = NULL; + unregister_netdevice(dev); + rtnl_unlock(); module_put(THIS_MODULE); + } else { + rtnl_unlock(); } } } @@ -189,9 +209,20 @@ static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct l2tp_eth_sess *spriv = l2tp_session_priv(session); - struct net_device *dev = spriv->dev; + struct net_device *dev; + + rcu_read_lock(); + dev = rcu_dereference(spriv->dev); + if (!dev) { + rcu_read_unlock(); + return; + } + dev_hold(dev); + rcu_read_unlock(); seq_printf(m, " interface %s\n", dev->name); + + dev_put(dev); } #endif @@ -268,21 +299,14 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, peer_session_id, cfg); if (IS_ERR(session)) { rc = PTR_ERR(session); - goto out; - } - - l2tp_session_inc_refcount(session); - rc = l2tp_session_register(session, tunnel); - if (rc < 0) { - kfree(session); - goto out; + goto err; } dev = alloc_netdev(sizeof(*priv), name, name_assign_type, l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; - goto out_del_session; + goto err_sess; } dev_net_set(dev, net); @@ -302,28 +326,48 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, #endif spriv = l2tp_session_priv(session); - spriv->dev = dev; - rc = register_netdev(dev); - if (rc < 0) - goto out_del_dev; + l2tp_session_inc_refcount(session); + + rtnl_lock(); + + /* Register both device and session while holding the rtnl lock. This + * ensures that l2tp_eth_delete() will see that there's a device to + * unregister, even if it happened to run before we assign spriv->dev. + */ + rc = l2tp_session_register(session, tunnel); + if (rc < 0) { + rtnl_unlock(); + goto err_sess_dev; + } + + rc = register_netdevice(dev); + if (rc < 0) { + rtnl_unlock(); + l2tp_session_delete(session); + l2tp_session_dec_refcount(session); + free_netdev(dev); + + return rc; + } - __module_get(THIS_MODULE); - /* Must be done after register_netdev() */ strlcpy(session->ifname, dev->name, IFNAMSIZ); + rcu_assign_pointer(spriv->dev, dev); + + rtnl_unlock(); + l2tp_session_dec_refcount(session); - dev_hold(dev); + __module_get(THIS_MODULE); return 0; -out_del_dev: - free_netdev(dev); - spriv->dev = NULL; -out_del_session: - l2tp_session_delete(session); +err_sess_dev: l2tp_session_dec_refcount(session); -out: + free_netdev(dev); +err_sess: + kfree(session); +err: return rc; } -- cgit v1.2.3 From ee40fb2e1eb5bc0ddd3f2f83c6e39a454ef5a741 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Oct 2017 16:51:52 +0200 Subject: l2tp: protect sock pointer of struct pppol2tp_session with RCU pppol2tp_session_create() registers sessions that can't have their corresponding socket initialised. This socket has to be created by userspace, then connected to the session by pppol2tp_connect(). Therefore, we need to protect the pppol2tp socket pointer of L2TP sessions, so that it can safely be updated when userspace is connecting or closing the socket. This will eventually allow pppol2tp_connect() to avoid generating transient states while initialising its parts of the session. To this end, this patch protects the pppol2tp socket pointer using RCU. The pppol2tp socket pointer is still set in pppol2tp_connect(), but only once we know the function isn't going to fail. It's eventually reset by pppol2tp_release(), which now has to wait for a grace period to elapse before it can drop the last reference on the socket. This ensures that pppol2tp_session_get_sock() can safely grab a reference on the socket, even after ps->sk is reset to NULL but before this operation actually gets visible from pppol2tp_session_get_sock(). The rest is standard RCU conversion: pppol2tp_recv(), which already runs in atomic context, is simply enclosed by rcu_read_lock() and rcu_read_unlock(), while other functions are converted to use pppol2tp_session_get_sock() followed by sock_put(). pppol2tp_session_setsockopt() is a special case. It used to retrieve the pppol2tp socket from the L2TP session, which itself was retrieved from the pppol2tp socket. Therefore we can just avoid dereferencing ps->sk and directly use the original socket pointer instead. With all users of ps->sk now handling NULL and concurrent updates, the L2TP ->ref() and ->deref() callbacks aren't needed anymore. Therefore, rather than converting pppol2tp_session_sock_hold() and pppol2tp_session_sock_put(), we can just drop them. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 154 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 40cf7a78e331..d40d5492c148 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -122,8 +122,11 @@ struct pppol2tp_session { int owner; /* pid that opened the socket */ - struct sock *sock; /* Pointer to the session + struct mutex sk_lock; /* Protects .sk */ + struct sock __rcu *sk; /* Pointer to the session * PPPoX socket */ + struct sock *__sk; /* Copy of .sk, for cleanup */ + struct rcu_head rcu; /* For asynchronous release */ struct sock *tunnel_sock; /* Pointer to the tunnel UDP * socket */ int flags; /* accessed by PPPIOCGFLAGS. @@ -138,6 +141,24 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = { static const struct proto_ops pppol2tp_ops; +/* Retrieves the pppol2tp socket associated to a session. + * A reference is held on the returned socket, so this function must be paired + * with sock_put(). + */ +static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk; + + rcu_read_lock(); + sk = rcu_dereference(ps->sk); + if (sk) + sock_hold(sk); + rcu_read_unlock(); + + return sk; +} + /* Helpers to obtain tunnel/session contexts from sockets. */ static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) @@ -224,7 +245,8 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ - sk = ps->sock; + rcu_read_lock(); + sk = rcu_dereference(ps->sk); if (sk == NULL) goto no_sock; @@ -247,30 +269,16 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int kfree_skb(skb); } } + rcu_read_unlock(); return; no_sock: + rcu_read_unlock(); l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name); kfree_skb(skb); } -static void pppol2tp_session_sock_hold(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock) - sock_hold(ps->sock); -} - -static void pppol2tp_session_sock_put(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock) - sock_put(ps->sock); -} - /************************************************************************ * Transmit handling ***********************************************************************/ @@ -431,14 +439,16 @@ abort: */ static void pppol2tp_session_close(struct l2tp_session *session) { - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk = ps->sock; - struct socket *sock = sk->sk_socket; + struct sock *sk; BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) - inet_shutdown(sock, SEND_SHUTDOWN); + sk = pppol2tp_session_get_sock(session); + if (sk) { + if (sk->sk_socket) + inet_shutdown(sk->sk_socket, SEND_SHUTDOWN); + sock_put(sk); + } /* Don't let the session go away before our socket does */ l2tp_session_inc_refcount(session); @@ -461,6 +471,14 @@ static void pppol2tp_session_destruct(struct sock *sk) } } +static void pppol2tp_put_sk(struct rcu_head *head) +{ + struct pppol2tp_session *ps; + + ps = container_of(head, typeof(*ps), rcu); + sock_put(ps->__sk); +} + /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) @@ -486,11 +504,24 @@ static int pppol2tp_release(struct socket *sock) session = pppol2tp_sock_to_session(sk); - /* Purge any queued data */ if (session != NULL) { + struct pppol2tp_session *ps; + __l2tp_session_unhash(session); l2tp_session_queue_purge(session); - sock_put(sk); + + ps = l2tp_session_priv(session); + mutex_lock(&ps->sk_lock); + ps->__sk = rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock)); + RCU_INIT_POINTER(ps->sk, NULL); + mutex_unlock(&ps->sk_lock); + call_rcu(&ps->rcu, pppol2tp_put_sk); + + /* Rely on the sock_put() call at the end of the function for + * dropping the reference held by pppol2tp_sock_to_session(). + * The last reference will be dropped by pppol2tp_put_sk(). + */ } release_sock(sk); @@ -557,12 +588,14 @@ out: static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; - struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk; + + sk = pppol2tp_session_get_sock(session); + if (sk) { + struct pppox_sock *po = pppox_sk(sk); - if (ps) { - struct pppox_sock *po = pppox_sk(ps->sock); - if (po) - seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + sock_put(sk); } } #endif @@ -693,13 +726,17 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Using a pre-existing session is fine as long as it hasn't * been connected yet. */ - if (ps->sock) { + mutex_lock(&ps->sk_lock); + if (rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock))) { + mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; } /* consistency checks */ if (ps->tunnel_sock != tunnel->sock) { + mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; } @@ -716,19 +753,21 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } + ps = l2tp_session_priv(session); + mutex_init(&ps->sk_lock); l2tp_session_inc_refcount(session); + + mutex_lock(&ps->sk_lock); error = l2tp_session_register(session, tunnel); if (error < 0) { + mutex_unlock(&ps->sk_lock); kfree(session); goto end; } drop_refcnt = true; } - /* Associate session with its PPPoL2TP socket */ - ps = l2tp_session_priv(session); ps->owner = current->pid; - ps->sock = sk; ps->tunnel_sock = tunnel->sock; session->recv_skb = pppol2tp_recv; @@ -737,12 +776,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->show = pppol2tp_show; #endif - /* We need to know each time a skb is dropped from the reorder - * queue. - */ - session->ref = pppol2tp_session_sock_hold; - session->deref = pppol2tp_session_sock_put; - /* If PMTU discovery was enabled, use the MTU that was discovered */ dst = sk_dst_get(tunnel->sock); if (dst != NULL) { @@ -776,12 +809,17 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, po->chan.mtu = session->mtu; error = ppp_register_net_channel(sock_net(sk), &po->chan); - if (error) + if (error) { + mutex_unlock(&ps->sk_lock); goto end; + } out_no_ppp: /* This is how we get the session context from the socket. */ sk->sk_user_data = session; + rcu_assign_pointer(ps->sk, sk); + mutex_unlock(&ps->sk_lock); + sk->sk_state = PPPOX_CONNECTED; l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n", session->name); @@ -827,6 +865,7 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, } ps = l2tp_session_priv(session); + mutex_init(&ps->sk_lock); ps->tunnel_sock = tunnel->sock; error = l2tp_session_register(session, tunnel); @@ -998,12 +1037,10 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", session->name, cmd, arg); - sk = ps->sock; + sk = pppol2tp_session_get_sock(session); if (!sk) return -EBADR; - sock_hold(sk); - switch (cmd) { case SIOCGIFMTU: err = -ENXIO; @@ -1279,7 +1316,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk, int optname, int val) { int err = 0; - struct pppol2tp_session *ps = l2tp_session_priv(session); switch (optname) { case PPPOL2TP_SO_RECVSEQ: @@ -1300,8 +1336,8 @@ static int pppol2tp_session_setsockopt(struct sock *sk, } session->send_seq = !!val; { - struct sock *ssk = ps->sock; - struct pppox_sock *po = pppox_sk(ssk); + struct pppox_sock *po = pppox_sk(sk); + po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } @@ -1640,8 +1676,9 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) { struct l2tp_session *session = v; struct l2tp_tunnel *tunnel = session->tunnel; - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct pppox_sock *po = pppox_sk(ps->sock); + unsigned char state; + char user_data_ok; + struct sock *sk; u32 ip = 0; u16 port = 0; @@ -1651,6 +1688,15 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) port = ntohs(inet->inet_sport); } + sk = pppol2tp_session_get_sock(session); + if (sk) { + state = sk->sk_state; + user_data_ok = (session == sk->sk_user_data) ? 'Y' : 'N'; + } else { + state = 0; + user_data_ok = 'N'; + } + seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> " "%04X/%04X %d %c\n", session->name, ip, port, @@ -1658,9 +1704,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->session_id, tunnel->peer_tunnel_id, session->peer_session_id, - ps->sock->sk_state, - (session == ps->sock->sk_user_data) ? - 'Y' : 'N'); + state, user_data_ok); seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n", session->mtu, session->mru, session->recv_seq ? 'R' : '-', @@ -1677,8 +1721,12 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); - if (po) + if (sk) { + struct pppox_sock *po = pppox_sk(sk); + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + sock_put(sk); + } } static int pppol2tp_seq_show(struct seq_file *m, void *v) -- cgit v1.2.3 From f98be6c6359e7e4a61aaefb9964c1db31cb9ec0c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Oct 2017 16:51:52 +0200 Subject: l2tp: initialise PPP sessions before registering them pppol2tp_connect() initialises L2TP sessions after they've been exposed to the rest of the system by l2tp_session_register(). This puts sessions into transient states that are the source of several races, in particular with session's deletion path. This patch centralises the initialisation code into pppol2tp_session_init(), which is called before the registration phase. The only field that can't be set before session registration is the pppol2tp socket pointer, which has already been converted to RCU. So pppol2tp_connect() should now be race-free. The session's .session_close() callback is now set before registration. Therefore, it's always called when l2tp_core deletes the session, even if it was created by pppol2tp_session_create() and hasn't been plugged to a pppol2tp socket yet. That'd prevent session free because the extra reference taken by pppol2tp_session_close() wouldn't be dropped by the socket's ->sk_destruct() callback (pppol2tp_session_destruct()). We could set .session_close() only while connecting a session to its pppol2tp socket, or teach pppol2tp_session_close() to avoid grabbing a reference when the session isn't connected, but that'd require adding some form of synchronisation to be race free. Instead of that, we can just let the pppol2tp socket hold a reference on the session as soon as it starts depending on it (that is, in pppol2tp_connect()). Then we don't need to utilise pppol2tp_session_close() to hold a reference at the last moment to prevent l2tp_core from dropping it. When releasing the socket, pppol2tp_release() now deletes the session using the standard l2tp_session_delete() function, instead of merely removing it from hash tables. l2tp_session_delete() drops the reference the sessions holds on itself, but also makes sure it doesn't remove a session twice. So it can safely be called, even if l2tp_core already tried, or is concurrently trying, to remove the session. Finally, pppol2tp_session_destruct() drops the reference held by the socket. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 69 +++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d40d5492c148..845aba543dce 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -449,9 +449,6 @@ static void pppol2tp_session_close(struct l2tp_session *session) inet_shutdown(sk->sk_socket, SEND_SHUTDOWN); sock_put(sk); } - - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if @@ -507,8 +504,7 @@ static int pppol2tp_release(struct socket *sock) if (session != NULL) { struct pppol2tp_session *ps; - __l2tp_session_unhash(session); - l2tp_session_queue_purge(session); + l2tp_session_delete(session); ps = l2tp_session_priv(session); mutex_lock(&ps->sk_lock); @@ -600,6 +596,35 @@ static void pppol2tp_show(struct seq_file *m, void *arg) } #endif +static void pppol2tp_session_init(struct l2tp_session *session) +{ + struct pppol2tp_session *ps; + struct dst_entry *dst; + + session->recv_skb = pppol2tp_recv; + session->session_close = pppol2tp_session_close; +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) + session->show = pppol2tp_show; +#endif + + ps = l2tp_session_priv(session); + mutex_init(&ps->sk_lock); + ps->tunnel_sock = session->tunnel->sock; + ps->owner = current->pid; + + /* If PMTU discovery was enabled, use the MTU that was discovered */ + dst = sk_dst_get(session->tunnel->sock); + if (dst) { + u32 pmtu = dst_mtu(dst); + + if (pmtu) { + session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD; + session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; + } + dst_release(dst); + } +} + /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, @@ -611,7 +636,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, struct l2tp_session *session = NULL; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; - struct dst_entry *dst; struct l2tp_session_cfg cfg = { 0, }; int error = 0; u32 tunnel_id, peer_tunnel_id; @@ -753,8 +777,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } + pppol2tp_session_init(session); ps = l2tp_session_priv(session); - mutex_init(&ps->sk_lock); l2tp_session_inc_refcount(session); mutex_lock(&ps->sk_lock); @@ -767,26 +791,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, drop_refcnt = true; } - ps->owner = current->pid; - ps->tunnel_sock = tunnel->sock; - - session->recv_skb = pppol2tp_recv; - session->session_close = pppol2tp_session_close; -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) - session->show = pppol2tp_show; -#endif - - /* If PMTU discovery was enabled, use the MTU that was discovered */ - dst = sk_dst_get(tunnel->sock); - if (dst != NULL) { - u32 pmtu = dst_mtu(dst); - - if (pmtu != 0) - session->mtu = session->mru = pmtu - - PPPOL2TP_HEADER_OVERHEAD; - dst_release(dst); - } - /* Special case: if source & dest session_id == 0x0000, this * socket is being created to manage the tunnel. Just set up * the internal context for use by ioctl() and sockopt() @@ -820,6 +824,12 @@ out_no_ppp: rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); + /* Keep the reference we've grabbed on the session: sk doesn't expect + * the session to disappear. pppol2tp_session_destruct() is responsible + * for dropping it. + */ + drop_refcnt = false; + sk->sk_state = PPPOX_CONNECTED; l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n", session->name); @@ -841,7 +851,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, { int error; struct l2tp_session *session; - struct pppol2tp_session *ps; /* Error if tunnel socket is not prepped */ if (!tunnel->sock) { @@ -864,9 +873,7 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, goto err; } - ps = l2tp_session_priv(session); - mutex_init(&ps->sk_lock); - ps->tunnel_sock = tunnel->sock; + pppol2tp_session_init(session); error = l2tp_session_register(session, tunnel); if (error < 0) -- cgit v1.2.3 From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving pmtu discovery or ip redirect msg. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into exception table, user added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attemp to add cached routes from userspace with returning EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 605e5dc1c010..70d9659fc1e9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2478,6 +2478,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; -- cgit v1.2.3 From f17d858ed0a48270db4368d8cf370e3839ee6f4f Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Wed, 25 Oct 2017 10:58:48 +0530 Subject: Bluetooth: Fix potential memory leak If command is added to req then it should be freed in case if hdev is down or HCI_ADVERTISING flag is set. This introduces a helper in hci_request to purge the cmd_q to make cmd_q internal to hci_request which is used to fix the leak. This also replace accessing of cmd_q in hci_conn with the new helper. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_request.c | 5 +++++ net/bluetooth/hci_request.h | 1 + net/bluetooth/mgmt.c | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index dc59eae54717..746adcb62259 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -907,7 +907,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE) { - skb_queue_purge(&req.cmd_q); + hci_req_purge(&req); hci_conn_del(conn); return ERR_PTR(-EBUSY); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index b73ac149de34..7f28d17dc792 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -41,6 +41,11 @@ void hci_req_init(struct hci_request *req, struct hci_dev *hdev) req->err = 0; } +void hci_req_purge(struct hci_request *req) +{ + skb_queue_purge(&req->cmd_q); +} + static int req_run(struct hci_request *req, hci_req_complete_t complete, hci_req_complete_skb_t complete_skb) { diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index dde77bd59f91..702beb140d9f 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -36,6 +36,7 @@ struct hci_request { }; void hci_req_init(struct hci_request *req, struct hci_dev *hdev); +void hci_req_purge(struct hci_request *req); int hci_req_run(struct hci_request *req, hci_req_complete_t complete); int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete); void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1fba2a03f8ae..07a3cc29f426 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6383,6 +6383,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, if (skb_queue_empty(&req.cmd_q) || !hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) { + hci_req_purge(&req); rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, -- cgit v1.2.3 From a9ee77af751f435675054f5a7e2d2e69cbfe9e33 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Sep 2017 15:51:23 +0200 Subject: Bluetooth: avoid recursive locking in hci_send_to_channel() Mart reported a deadlock in -RT in the call path: hci_send_monitor_ctrl_event() -> hci_send_to_channel() because both functions acquire the same read lock hci_sk_list.lock. This is also a mainline issue because the qrwlock implementation is writer fair (the traditional rwlock implementation is reader biased). To avoid the deadlock there is now __hci_send_to_channel() which expects the readlock to be held. Fixes: 38ceaa00d02d ("Bluetooth: Add support for sending MGMT commands and events to monitor") Reported-by: Mart van de Wege Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sock.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 65d734c165bd..923e9a271872 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) } /* Send frame to sockets with specific channel */ -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, - int flag, struct sock *skip_sk) +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, + int flag, struct sock *skip_sk) { struct sock *sk; BT_DBG("channel %u len %d", channel, skb->len); - read_lock(&hci_sk_list.lock); - sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; @@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, kfree_skb(nskb); } +} + +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, + int flag, struct sock *skip_sk) +{ + read_lock(&hci_sk_list.lock); + __hci_send_to_channel(channel, skb, flag, skip_sk); read_unlock(&hci_sk_list.lock); } @@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); - hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, - HCI_SOCK_TRUSTED, NULL); + __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } -- cgit v1.2.3 From 2064ee332e4c1b7495cf68b84355c213d8fe71fd Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Oct 2017 10:42:59 +0100 Subject: Bluetooth: Use bt_dev_err and bt_dev_info when possible In case of using BT_ERR and BT_INFO, convert to bt_dev_err and bt_dev_info when possible. This allows for controller specific reporting. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/a2mp.c | 2 +- net/bluetooth/amp.c | 4 ++-- net/bluetooth/hci_conn.c | 4 ++-- net/bluetooth/hci_core.c | 35 ++++++++++++++-------------- net/bluetooth/hci_event.c | 46 +++++++++++++++++++------------------ net/bluetooth/hci_request.c | 16 +++++++------ net/bluetooth/hci_sysfs.c | 2 +- net/bluetooth/mgmt.c | 56 ++++++++++++++++++++++++--------------------- net/bluetooth/smp.c | 22 +++++++++--------- 9 files changed, 97 insertions(+), 90 deletions(-) (limited to 'net') diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index aad994edd3bb..51c2cf2d8923 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -573,7 +573,7 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK, &mgr->l2cap_conn->hcon->dst); if (!hcon) { - BT_ERR("No phys link exist"); + bt_dev_err(hdev, "no phys link exist"); rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS; goto clean; } diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index ebcab5bbadd7..78bec8df8525 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -187,7 +187,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Legacy key */ if (conn->key_type < 3) { - BT_ERR("Legacy key type %d", conn->key_type); + bt_dev_err(hdev, "legacy key type %d", conn->key_type); return -EACCES; } @@ -207,7 +207,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Derive Generic AMP Link Key (gamp) */ err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key); if (err) { - BT_ERR("Could not derive Generic AMP Key: err %d", err); + bt_dev_err(hdev, "could not derive Generic AMP Key: err %d", err); return err; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 746adcb62259..a9682534c377 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -729,8 +729,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) goto done; } - BT_ERR("HCI request failed to create LE connection: status 0x%2.2x", - status); + bt_dev_err(hdev, "request failed to create LE connection: " + "status 0x%2.2x", status); if (!conn) goto done; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6bc679cd3481..40d260f2bea5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -267,7 +267,7 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt) amp_init1(req); break; default: - BT_ERR("Unknown device type %d", hdev->dev_type); + bt_dev_err(hdev, "Unknown device type %d", hdev->dev_type); break; } @@ -2150,8 +2150,7 @@ static void hci_error_reset(struct work_struct *work) if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else - BT_ERR("%s hardware error 0x%2.2x", hdev->name, - hdev->hw_error_code); + bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (hci_dev_do_close(hdev)) return; @@ -2524,9 +2523,9 @@ static void hci_cmd_timeout(struct work_struct *work) struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; u16 opcode = __le16_to_cpu(sent->opcode); - BT_ERR("%s command 0x%4.4x tx timeout", hdev->name, opcode); + bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); } else { - BT_ERR("%s command tx timeout", hdev->name); + bt_dev_err(hdev, "command tx timeout"); } atomic_set(&hdev->cmd_cnt, 1); @@ -2858,7 +2857,7 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { - BT_ERR("Out of memory"); + bt_dev_err(hdev, "out of memory"); return NULL; } @@ -3393,7 +3392,7 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) err = hdev->send(hdev, skb); if (err < 0) { - BT_ERR("%s sending frame failed (%d)", hdev->name, err); + bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); } } @@ -3408,7 +3407,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { - BT_ERR("%s no memory for command", hdev->name); + bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } @@ -3493,7 +3492,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_add_acl_hdr(skb, chan->handle, flags); break; default: - BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type); + bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); return; } @@ -3618,7 +3617,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, break; default: cnt = 0; - BT_ERR("Unknown link type"); + bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; @@ -3635,15 +3634,15 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; - BT_ERR("%s link tx timeout", hdev->name); + bt_dev_err(hdev, "link tx timeout"); rcu_read_lock(); /* Kill stalled connections */ list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && c->sent) { - BT_ERR("%s killing stalled connection %pMR", - hdev->name, &c->dst); + bt_dev_err(hdev, "killing stalled connection %pMR", + &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } @@ -3724,7 +3723,7 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, break; default: cnt = 0; - BT_ERR("Unknown link type"); + bt_dev_err(hdev, "unknown link type %d", chan->conn->type); } q = cnt / num; @@ -4066,8 +4065,8 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) l2cap_recv_acldata(conn, skb, flags); return; } else { - BT_ERR("%s ACL packet for unknown connection handle %d", - hdev->name, handle); + bt_dev_err(hdev, "ACL packet for unknown connection handle %d", + handle); } kfree_skb(skb); @@ -4097,8 +4096,8 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) sco_recv_scodata(conn, skb); return; } else { - BT_ERR("%s SCO packet for unknown connection handle %d", - hdev->name, handle); + bt_dev_err(hdev, "SCO packet for unknown connection handle %d", + handle); } kfree_skb(skb); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0b4dba08a14e..cd3bbb766c24 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1188,7 +1188,8 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, break; default: - BT_ERR("Used reserved LE_Scan_Enable param %d", cp->enable); + bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d", + cp->enable); break; } @@ -1485,7 +1486,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr, HCI_ROLE_MASTER); if (!conn) - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); } } @@ -2269,7 +2270,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, HCI_ROLE_SLAVE); if (!conn) { - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); hci_dev_unlock(hdev); return; } @@ -2431,7 +2432,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!hci_conn_ssp_enabled(conn) && test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { - BT_INFO("re-auth of legacy device is not possible."); + bt_dev_info(hdev, "re-auth of legacy device is not possible."); } else { set_bit(HCI_CONN_AUTH, &conn->flags); conn->sec_level = conn->pending_sec_level; @@ -2535,8 +2536,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, BT_DBG("%s status 0x%02x", hdev->name, status); if (!skb || skb->len < sizeof(*rp)) { - BT_ERR("%s invalid HCI Read Encryption Key Size response", - hdev->name); + bt_dev_err(hdev, "invalid read key size response"); return; } @@ -2554,8 +2554,8 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, * supported. */ if (rp->status) { - BT_ERR("%s failed to read key size for handle %u", hdev->name, - handle); + bt_dev_err(hdev, "failed to read key size for handle %u", + handle); conn->enc_key_size = HCI_LINK_KEY_SIZE; } else { conn->enc_key_size = rp->key_size; @@ -2664,7 +2664,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); if (hci_req_run_skb(&req, read_enc_key_size_complete)) { - BT_ERR("Sending HCI Read Encryption Key Size failed"); + bt_dev_err(hdev, "sending read key size failed"); conn->enc_key_size = HCI_LINK_KEY_SIZE; goto notify; } @@ -3197,7 +3197,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) int i; if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) { - BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode); + bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode); return; } @@ -3249,7 +3249,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) break; default: - BT_ERR("Unknown type %d conn %p", conn->type, conn); + bt_dev_err(hdev, "unknown type %d conn %p", + conn->type, conn); break; } } @@ -3271,7 +3272,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, return chan->conn; break; default: - BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type); + bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); break; } @@ -3284,7 +3285,7 @@ static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb) int i; if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) { - BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode); + bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode); return; } @@ -3320,7 +3321,8 @@ static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb) break; default: - BT_ERR("Unknown type %d conn %p", conn->type, conn); + bt_dev_err(hdev, "unknown type %d conn %p", + conn->type, conn); break; } } @@ -4479,7 +4481,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!conn) { conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); if (!conn) { - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); goto unlock; } @@ -4749,8 +4751,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, case LE_ADV_SCAN_RSP: break; default: - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - type); + bt_dev_err_ratelimited(hdev, "unknown advertising packet " + "type: 0x%02x", type); return; } @@ -4769,8 +4771,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - BT_ERR_RATELIMITED("%s advertising data length corrected", - hdev->name); + bt_dev_err_ratelimited(hdev, "advertising data len corrected"); len = real_len; } @@ -5192,7 +5193,7 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return false; if (skb->len < sizeof(*hdr)) { - BT_ERR("Too short HCI event"); + bt_dev_err(hdev, "too short HCI event"); return false; } @@ -5206,12 +5207,13 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, } if (hdr->evt != HCI_EV_CMD_COMPLETE) { - BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt); + bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)", + hdr->evt); return false; } if (skb->len < sizeof(*ev)) { - BT_ERR("Too short cmd_complete event"); + bt_dev_err(hdev, "too short cmd_complete event"); return false; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 7f28d17dc792..abc0f3224dd1 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -336,8 +336,8 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { - BT_ERR("%s no memory for command (opcode 0x%4.4x)", - hdev->name, opcode); + bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", + opcode); req->err = -ENOMEM; return; } @@ -1426,7 +1426,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { - BT_ERR("%s failed to generate new RPA", hdev->name); + bt_dev_err(hdev, "failed to generate new RPA"); return err; } @@ -1788,7 +1788,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) err = hci_req_run(&req, abort_conn_complete); if (err && err != -ENODATA) { - BT_ERR("Failed to run HCI request: err %d", err); + bt_dev_err(conn->hdev, "failed to run HCI request: err %d", err); return err; } @@ -1872,7 +1872,8 @@ static void le_scan_disable_work(struct work_struct *work) hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Failed to disable LE scan: status 0x%02x", status); + bt_dev_err(hdev, "failed to disable LE scan: status 0x%02x", + status); return; } @@ -1903,7 +1904,7 @@ static void le_scan_disable_work(struct work_struct *work) hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Inquiry failed: status 0x%02x", status); + bt_dev_err(hdev, "inquiry failed: status 0x%02x", status); goto discov_stopped; } @@ -1945,7 +1946,8 @@ static void le_scan_restart_work(struct work_struct *work) hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Failed to restart LE scan: status %d", status); + bt_dev_err(hdev, "failed to restart LE scan: status %d", + status); return; } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index aa300f3a0d51..34aaa2340ac8 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -50,7 +50,7 @@ void hci_conn_add_sysfs(struct hci_conn *conn) dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); if (device_add(&conn->dev) < 0) { - BT_ERR("Failed to register connection device"); + bt_dev_err(hdev, "failed to register connection device"); return; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 07a3cc29f426..6e9fc86d8daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2159,8 +2159,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { - BT_ERR("load_link_keys: too big key_count value %u", - key_count); + bt_dev_err(hdev, "load_link_keys: too big key_count value %u", + key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2168,8 +2168,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_link_key_info); if (expected_len != len) { - BT_ERR("load_link_keys: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2561,7 +2561,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, memcpy(&ncp.addr, &cp->addr, sizeof(ncp.addr)); - BT_ERR("PIN code is not 16 bytes long"); + bt_dev_err(hdev, "PIN code is not 16 bytes long"); err = send_pin_code_neg_reply(sk, hdev, &ncp); if (err >= 0) @@ -3391,7 +3391,8 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, MGMT_OP_ADD_REMOTE_OOB_DATA, status, &cp->addr, sizeof(cp->addr)); } else { - BT_ERR("add_remote_oob_data: invalid length of %u bytes", len); + bt_dev_err(hdev, "add_remote_oob_data: invalid len of %u bytes", + len); err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, MGMT_STATUS_INVALID_PARAMS); } @@ -3604,8 +3605,8 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, uuid_count = __le16_to_cpu(cp->uuid_count); if (uuid_count > max_uuid_count) { - BT_ERR("service_discovery: too big uuid_count value %u", - uuid_count); + bt_dev_err(hdev, "service_discovery: too big uuid_count value %u", + uuid_count); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_SERVICE_DISCOVERY, MGMT_STATUS_INVALID_PARAMS, &cp->type, @@ -3615,8 +3616,8 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, expected_len = sizeof(*cp) + uuid_count * 16; if (expected_len != len) { - BT_ERR("service_discovery: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "service_discovery: expected %u bytes, got %u bytes", + expected_len, len); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_SERVICE_DISCOVERY, MGMT_STATUS_INVALID_PARAMS, &cp->type, @@ -3943,7 +3944,7 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, err = hci_req_run(&req, enable_advertising_instance); if (err) - BT_ERR("Failed to re-configure advertising"); + bt_dev_err(hdev, "failed to re-configure advertising"); unlock: hci_dev_unlock(hdev); @@ -4664,15 +4665,16 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, irk_count = __le16_to_cpu(cp->irk_count); if (irk_count > max_irk_count) { - BT_ERR("load_irks: too big irk_count value %u", irk_count); + bt_dev_err(hdev, "load_irks: too big irk_count value %u", + irk_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); if (expected_len != len) { - BT_ERR("load_irks: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } @@ -4745,7 +4747,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { - BT_ERR("load_ltks: too big key_count value %u", key_count); + bt_dev_err(hdev, "load_ltks: too big key_count value %u", + key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -4753,8 +4756,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_ltk_info); if (expected_len != len) { - BT_ERR("load_keys: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -4873,14 +4876,15 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, } if (!cp) { - BT_ERR("invalid sent_cmd in conn_info response"); + bt_dev_err(hdev, "invalid sent_cmd in conn_info response"); goto unlock; } handle = __le16_to_cpu(cp->handle); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) { - BT_ERR("unknown handle (%d) in conn_info response", handle); + bt_dev_err(hdev, "unknown handle (%d) in conn_info response", + handle); goto unlock; } @@ -5477,8 +5481,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, param_count = __le16_to_cpu(cp->param_count); if (param_count > max_param_count) { - BT_ERR("load_conn_param: too big param_count value %u", - param_count); + bt_dev_err(hdev, "load_conn_param: too big param_count value %u", + param_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } @@ -5486,8 +5490,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, expected_len = sizeof(*cp) + param_count * sizeof(struct mgmt_conn_param); if (expected_len != len) { - BT_ERR("load_conn_param: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } @@ -5512,7 +5516,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, } else if (param->addr.type == BDADDR_LE_RANDOM) { addr_type = ADDR_LE_DEV_RANDOM; } else { - BT_ERR("Ignoring invalid connection parameters"); + bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } @@ -5525,14 +5529,14 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { - BT_ERR("Ignoring invalid connection parameters"); + bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } hci_param = hci_conn_params_add(hdev, ¶m->addr.bdaddr, addr_type); if (!hci_param) { - BT_ERR("Failed to add connection parameters"); + bt_dev_err(hdev, "failed to add connection parameters"); continue; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d41449b9e9d6..01117ae84f1d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -996,7 +996,8 @@ static u8 smp_random(struct smp_chan *smp) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) { - BT_ERR("Pairing failed (confirmation values mismatch)"); + bt_dev_err(hcon->hdev, "pairing failed " + "(confirmation values mismatch)"); return SMP_CONFIRM_FAILED; } @@ -1210,7 +1211,7 @@ static void sc_generate_ltk(struct smp_chan *smp) key = hci_find_link_key(hdev, &hcon->dst); if (!key) { - BT_ERR("%s No Link Key found to generate LTK", hdev->name); + bt_dev_err(hdev, "no Link Key found to generate LTK"); return; } @@ -2067,11 +2068,11 @@ static int fixup_sc_false_positive(struct smp_chan *smp) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { - BT_ERR("Refusing SMP SC -> legacy fallback in SC-only mode"); + bt_dev_err(hdev, "refusing legacy fallback in SC-only mode"); return SMP_UNSPECIFIED; } - BT_ERR("Trying to fall back to legacy SMP"); + bt_dev_err(hdev, "trying to fall back to legacy SMP"); req = (void *) &smp->preq[1]; rsp = (void *) &smp->prsp[1]; @@ -2082,7 +2083,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) auth = req->auth_req & AUTH_REQ_MASK(hdev); if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { - BT_ERR("Failed to fall back to legacy SMP"); + bt_dev_err(hdev, "failed to fall back to legacy SMP"); return SMP_UNSPECIFIED; } @@ -2355,7 +2356,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) chan = conn->smp; if (!chan) { - BT_ERR("SMP security requested but not available"); + bt_dev_err(hcon->hdev, "security requested but not available"); return 1; } @@ -2548,7 +2549,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, */ if (!bacmp(&info->bdaddr, BDADDR_ANY) || !hci_is_identity_address(&info->bdaddr, info->addr_type)) { - BT_ERR("Ignoring IRK with no identity address"); + bt_dev_err(hcon->hdev, "ignoring IRK with no identity address"); goto distribute; } @@ -2953,8 +2954,8 @@ done: return err; drop: - BT_ERR("%s unexpected SMP command 0x%02x from %pMR", hcon->hdev->name, - code, &hcon->dst); + bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR", + code, &hcon->dst); kfree_skb(skb); return 0; } @@ -3021,8 +3022,7 @@ static void bredr_pairing(struct l2cap_chan *chan) smp = smp_chan_create(conn); if (!smp) { - BT_ERR("%s unable to create SMP context for BR/EDR", - hdev->name); + bt_dev_err(hdev, "unable to create SMP context for BR/EDR"); return; } -- cgit v1.2.3 From aa2bc739ef4a181a7589eb009be96a870cc1788f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Oct 2017 13:46:47 -0700 Subject: net: filter: remove unused variable and fix warning bpf_getsockopt bpf call sets the ret variable to zero and never changes it. What's worse in case CONFIG_INET is not selected the variable is completely unused generating a warning. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 721c30889033..a0112168d6f9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3288,7 +3288,6 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { struct sock *sk = bpf_sock->sk; - int ret = 0; if (!sk_fullsock(sk)) goto err_clear; @@ -3308,7 +3307,7 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } - return ret; + return 0; #endif err_clear: memset(optval, 0, optlen); -- cgit v1.2.3 From e4dca7b7aa08b22893c45485d222b5807c1375ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 19:04:42 -0700 Subject: treewide: Fix function prototypes for module_param_call() Several function prototypes for the set/get functions defined by module_param_call() have a slightly wrong argument types. This fixes those in an effort to clean up the calls when running under type-enforced compiler instrumentation for CFI. This is the result of running the following semantic patch: @match_module_param_call_function@ declarer name module_param_call; identifier _name, _set_func, _get_func; expression _arg, _mode; @@ module_param_call(_name, _set_func, _get_func, _arg, _mode); @fix_set_prototype depends on match_module_param_call_function@ identifier match_module_param_call_function._set_func; identifier _val, _param; type _val_type, _param_type; @@ int _set_func( -_val_type _val +const char * _val , -_param_type _param +const struct kernel_param * _param ) { ... } @fix_get_prototype depends on match_module_param_call_function@ identifier match_module_param_call_function._get_func; identifier _val, _param; type _val_type, _param_type; @@ int _get_func( -_val_type _val +char * _val , -_param_type _param +const struct kernel_param * _param ) { ... } Two additional by-hand changes are included for places where the above Coccinelle script didn't notice them: drivers/platform/x86/thinkpad_acpi.c fs/lockd/svc.c Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_nat_ftp.c | 2 +- net/netfilter/nf_nat_irc.c | 2 +- net/sunrpc/svc.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01130392b7c0..5e50c54e1318 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1940,7 +1940,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) return 0; } -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { unsigned int hashsize; int rc; diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index e84a578dbe35..d76afafdc699 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -134,7 +134,7 @@ static int __init nf_nat_ftp_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 0648cb096bd8..dcb5f6375d9d 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -106,7 +106,7 @@ static int __init nf_nat_irc_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index aa04666f929d..e5e4e18699c8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int -param_set_pool_mode(const char *val, struct kernel_param *kp) +param_set_pool_mode(const char *val, const struct kernel_param *kp) { int *ip = (int *)kp->arg; struct svc_pool_map *m = &svc_pool_map; @@ -80,7 +80,7 @@ out: } static int -param_get_pool_mode(char *buf, struct kernel_param *kp) +param_get_pool_mode(char *buf, const struct kernel_param *kp) { int *ip = (int *)kp->arg; -- cgit v1.2.3 From 384c181e3780ddc45e70483e29d84495b484730d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 27 Oct 2017 02:35:34 -0700 Subject: net: sched: Identify hardware traffic classes using classid This patch offloads the classid to hardware and uses the classid reserved in the range :ffe0 - :ffef to identify hardware traffic classes reported via dev->num_tc. tcf_result structure contains the class ID of the class to which the packet belongs and is offloaded to hardware via flower filter. A new helper function is introduced to represent HW traffic classes 0 through 15 using the reserved classid values :ffe0 - :ffef. Signed-off-by: Amritha Nambiar Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/sched/cls_flower.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 35cb6d684e44..c99fa9e5be46 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -241,6 +241,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.mask = mask; cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); @@ -266,6 +267,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); -- cgit v1.2.3 From a4346210c4e092de50594b728300766121a3b00d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 31 Oct 2017 17:36:42 +0100 Subject: l2tp: remove ->ref() and ->deref() The ->ref() and ->deref() callbacks are unused since PPP stopped using them in ee40fb2e1eb5 ("l2tp: protect sock pointer of struct pppol2tp_session with RCU"). We can thus remove them from struct l2tp_session and drop the do_ref parameter of l2tp_session_get*(). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 52 ++++++++----------------------------------------- net/l2tp/l2tp_core.h | 10 +++------- net/l2tp/l2tp_debugfs.c | 4 +--- net/l2tp/l2tp_ip.c | 4 +--- net/l2tp/l2tp_ip6.c | 4 +--- net/l2tp/l2tp_netlink.c | 20 ++++++++----------- net/l2tp/l2tp_ppp.c | 10 +++------- 7 files changed, 25 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index a1d56e143fcd..216f49aec16f 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -216,12 +216,10 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) } EXPORT_SYMBOL_GPL(l2tp_tunnel_get); -/* Lookup a session. A new reference is held on the returned session. - * Optionally calls session->ref() too if do_ref is true. - */ +/* Lookup a session. A new reference is held on the returned session. */ struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, - u32 session_id, bool do_ref) + u32 session_id) { struct hlist_head *session_list; struct l2tp_session *session; @@ -235,8 +233,6 @@ struct l2tp_session *l2tp_session_get(const struct net *net, hlist_for_each_entry_rcu(session, session_list, global_hlist) { if (session->session_id == session_id) { l2tp_session_inc_refcount(session); - if (do_ref && session->ref) - session->ref(session); rcu_read_unlock_bh(); return session; @@ -252,8 +248,6 @@ struct l2tp_session *l2tp_session_get(const struct net *net, hlist_for_each_entry(session, session_list, hlist) { if (session->session_id == session_id) { l2tp_session_inc_refcount(session); - if (do_ref && session->ref) - session->ref(session); read_unlock_bh(&tunnel->hlist_lock); return session; @@ -265,8 +259,7 @@ struct l2tp_session *l2tp_session_get(const struct net *net, } EXPORT_SYMBOL_GPL(l2tp_session_get); -struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, - bool do_ref) +struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth) { int hash; struct l2tp_session *session; @@ -277,8 +270,6 @@ struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) { if (++count > nth) { l2tp_session_inc_refcount(session); - if (do_ref && session->ref) - session->ref(session); read_unlock_bh(&tunnel->hlist_lock); return session; } @@ -295,8 +286,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_get_nth); * This is very inefficient but is only used by management interfaces. */ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, - const char *ifname, - bool do_ref) + const char *ifname) { struct l2tp_net *pn = l2tp_pernet(net); int hash; @@ -307,8 +297,6 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { if (!strcmp(session->ifname, ifname)) { l2tp_session_inc_refcount(session); - if (do_ref && session->ref) - session->ref(session); rcu_read_unlock_bh(); return session; @@ -489,9 +477,6 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); else kfree_skb(skb); - - if (session->deref) - (*session->deref)(session); } /* Dequeue skbs from the session's reorder_q, subject to packet order. @@ -520,8 +505,6 @@ start: session->reorder_skip = 1; __skb_unlink(skb, &session->reorder_q); kfree_skb(skb); - if (session->deref) - (*session->deref)(session); continue; } @@ -694,9 +677,6 @@ discard: * a data (not control) frame before coming here. Fields up to the * session-id have already been parsed and ptr points to the data * after the session-id. - * - * session->ref() must have been called prior to l2tp_recv_common(). - * session->deref() will be called automatically after skb is processed. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, @@ -863,9 +843,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, discard: atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); - - if (session->deref) - (*session->deref)(session); } EXPORT_SYMBOL(l2tp_recv_common); @@ -879,8 +856,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session) while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); - if (session->deref) - (*session->deref)(session); } return 0; } @@ -972,13 +947,10 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, } /* Find the session context */ - session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true); + session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id); if (!session || !session->recv_skb) { - if (session) { - if (session->deref) - session->deref(session); + if (session) l2tp_session_dec_refcount(session); - } /* Not found? Pass to userspace to deal with */ l2tp_info(tunnel, L2TP_MSG_DATA, @@ -1322,9 +1294,6 @@ again: if (test_and_set_bit(0, &session->dead)) goto again; - if (session->ref != NULL) - (*session->ref)(session); - write_unlock_bh(&tunnel->hlist_lock); __l2tp_session_unhash(session); @@ -1333,9 +1302,6 @@ again: if (session->session_close != NULL) (*session->session_close)(session); - if (session->deref != NULL) - (*session->deref)(session); - l2tp_session_dec_refcount(session); write_lock_bh(&tunnel->hlist_lock); @@ -1759,15 +1725,13 @@ int l2tp_session_delete(struct l2tp_session *session) if (test_and_set_bit(0, &session->dead)) return 0; - if (session->ref) - (*session->ref)(session); __l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close != NULL) (*session->session_close)(session); - if (session->deref) - (*session->deref)(session); + l2tp_session_dec_refcount(session); + return 0; } EXPORT_SYMBOL_GPL(l2tp_session_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 77caa5966736..92c90e72259c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -129,8 +129,6 @@ struct l2tp_session { int (*build_header)(struct l2tp_session *session, void *buf); void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); void (*session_close)(struct l2tp_session *session); - void (*ref)(struct l2tp_session *session); - void (*deref)(struct l2tp_session *session); #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) void (*show)(struct seq_file *m, void *priv); #endif @@ -245,12 +243,10 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, - u32 session_id, bool do_ref); -struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, - bool do_ref); + u32 session_id); +struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, - const char *ifname, - bool do_ref); + const char *ifname); struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 53bae54c4d6e..eb69411bcb47 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -53,7 +53,7 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd) static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) { - pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true); + pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; if (pd->session == NULL) { @@ -241,8 +241,6 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v) l2tp_dfs_seq_tunnel_show(m, pd->tunnel); } else { l2tp_dfs_seq_session_show(m, pd->session); - if (pd->session->deref) - pd->session->deref(pd->session); l2tp_session_dec_refcount(pd->session); } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 4d322c1b7233..6dbe450400a2 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -143,7 +143,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_get(net, NULL, session_id, true); + session = l2tp_session_get(net, NULL, session_id); if (!session) goto discard; @@ -205,8 +205,6 @@ pass_up: return sk_receive_skb(sk, skb, 1); discard_sess: - if (session->deref) - session->deref(session); l2tp_session_dec_refcount(session); goto discard; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 88b397c30d86..59ebb6e4f735 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -156,7 +156,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_get(net, NULL, session_id, true); + session = l2tp_session_get(net, NULL, session_id); if (!session) goto discard; @@ -219,8 +219,6 @@ pass_up: return sk_receive_skb(sk, skb, 1); discard_sess: - if (session->deref) - session->deref(session); l2tp_session_dec_refcount(session); goto discard; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f04fb347d251..a1f24fb2be98 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -48,8 +48,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, /* Accessed under genl lock */ static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; -static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info, - bool do_ref) +static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) { u32 tunnel_id; u32 session_id; @@ -60,15 +59,14 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info, if (info->attrs[L2TP_ATTR_IFNAME]) { ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); - session = l2tp_session_get_by_ifname(net, ifname, do_ref); + session = l2tp_session_get_by_ifname(net, ifname); } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { - session = l2tp_session_get(net, tunnel, session_id, - do_ref); + session = l2tp_session_get(net, tunnel, session_id); l2tp_tunnel_dec_refcount(tunnel); } } @@ -649,7 +647,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf &cfg); if (ret >= 0) { - session = l2tp_session_get(net, tunnel, session_id, false); + session = l2tp_session_get(net, tunnel, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); @@ -669,7 +667,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf struct l2tp_session *session; u16 pw_type; - session = l2tp_nl_session_get(info, true); + session = l2tp_nl_session_get(info); if (session == NULL) { ret = -ENODEV; goto out; @@ -683,8 +681,6 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); - if (session->deref) - session->deref(session); l2tp_session_dec_refcount(session); out: @@ -696,7 +692,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf int ret = 0; struct l2tp_session *session; - session = l2tp_nl_session_get(info, false); + session = l2tp_nl_session_get(info); if (session == NULL) { ret = -ENODEV; goto out; @@ -828,7 +824,7 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; int ret; - session = l2tp_nl_session_get(info, false); + session = l2tp_nl_session_get(info); if (session == NULL) { ret = -ENODEV; goto err; @@ -874,7 +870,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback goto out; } - session = l2tp_session_get_nth(tunnel, si, false); + session = l2tp_session_get_nth(tunnel, si); if (session == NULL) { ti++; tunnel = NULL; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 845aba543dce..535db8319769 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -742,7 +742,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = peer_tunnel_id; - session = l2tp_session_get(sock_net(sk), tunnel, session_id, false); + session = l2tp_session_get(sock_net(sk), tunnel, session_id); if (session) { drop_refcnt = true; ps = l2tp_session_priv(session); @@ -1193,13 +1193,11 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, /* resend to session ioctl handler */ struct l2tp_session *session = l2tp_session_get(sock_net(sk), tunnel, - stats.session_id, true); + stats.session_id); if (session) { err = pppol2tp_session_ioctl(session, cmd, arg); - if (session->deref) - session->deref(session); l2tp_session_dec_refcount(session); } else { err = -EBADR; @@ -1615,7 +1613,7 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { - pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true); + pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; if (pd->session == NULL) { @@ -1758,8 +1756,6 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v) pppol2tp_seq_tunnel_show(m, pd->tunnel); } else { pppol2tp_seq_session_show(m, pd->session); - if (pd->session->deref) - pd->session->deref(pd->session); l2tp_session_dec_refcount(pd->session); } -- cgit v1.2.3 From 9ff672ba4eefd0821b4ad696511d7b4e94b7e539 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 31 Oct 2017 17:36:44 +0100 Subject: l2tp: remove l2tp specific refcount debugging With conversion to refcount_t, such manual debugging code doesn't make sense anymore. The tunnel part was already dropped by 54652eb12c1b ("l2tp: hold tunnel while looking up sessions in l2tp_netlink"). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.h | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 92c90e72259c..9534e16965cc 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -294,37 +294,17 @@ static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) /* Session reference counts. Incremented when code obtains a reference * to a session. */ -static inline void l2tp_session_inc_refcount_1(struct l2tp_session *session) +static inline void l2tp_session_inc_refcount(struct l2tp_session *session) { refcount_inc(&session->ref_count); } -static inline void l2tp_session_dec_refcount_1(struct l2tp_session *session) +static inline void l2tp_session_dec_refcount(struct l2tp_session *session) { if (refcount_dec_and_test(&session->ref_count)) l2tp_session_free(session); } -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_session_inc_refcount(_s) \ -do { \ - pr_debug("l2tp_session_inc_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_s)->name, \ - refcount_read(&_s->ref_count)); \ - l2tp_session_inc_refcount_1(_s); \ -} while (0) -#define l2tp_session_dec_refcount(_s) \ -do { \ - pr_debug("l2tp_session_dec_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_s)->name, \ - refcount_read(&_s->ref_count)); \ - l2tp_session_dec_refcount_1(_s); \ -} while (0) -#else -#define l2tp_session_inc_refcount(s) l2tp_session_inc_refcount_1(s) -#define l2tp_session_dec_refcount(s) l2tp_session_dec_refcount_1(s) -#endif - #define l2tp_printk(ptr, type, func, fmt, ...) \ do { \ if (((ptr)->debug) & (type)) \ -- cgit v1.2.3 From c7fa745d988812c4dea7dbc645f025c5bfa4917e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 31 Oct 2017 17:36:45 +0100 Subject: l2tp: remove l2tp_tunnel_count and l2tp_session_count These variables have never been used. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 216f49aec16f..7c8d1eb757a5 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -100,8 +100,6 @@ struct l2tp_skb_cb { #define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)]) -static atomic_t l2tp_tunnel_count; -static atomic_t l2tp_session_count; static struct workqueue_struct *l2tp_wq; /* per-net private data for this module */ @@ -359,10 +357,6 @@ int l2tp_session_register(struct l2tp_session *session, hlist_add_head(&session->hlist, head); write_unlock_bh(&tunnel->hlist_lock); - /* Ignore management session in session count value */ - if (session->session_id != 0) - atomic_inc(&l2tp_session_count); - return 0; err_tlock_pnlock: @@ -1251,7 +1245,6 @@ static void l2tp_tunnel_destruct(struct sock *sk) spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_del_rcu(&tunnel->list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - atomic_dec(&l2tp_tunnel_count); l2tp_tunnel_closeall(tunnel); @@ -1632,7 +1625,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); - atomic_inc(&l2tp_tunnel_count); /* Bump the reference count. The tunnel context is deleted * only when this drops to zero. Must be done before list insertion @@ -1678,8 +1670,6 @@ void l2tp_session_free(struct l2tp_session *session) if (tunnel) { BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - if (session->session_id != 0) - atomic_dec(&l2tp_session_count); sock_put(tunnel->sock); session->tunnel = NULL; l2tp_tunnel_dec_refcount(tunnel); -- cgit v1.2.3 From 675080f2391f8b6ecb2c807e3eb7693e12847502 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 31 Oct 2017 17:36:46 +0100 Subject: l2tp: remove field 'dev' from struct l2tp_eth This field has never been used. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d29bfee291cb..3e2dec1fb0f5 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -41,7 +41,6 @@ /* via netdev_priv() */ struct l2tp_eth { - struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; atomic_long_t tx_bytes; @@ -60,9 +59,6 @@ struct l2tp_eth_sess { static int l2tp_eth_dev_init(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - - priv->dev = dev; eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); netdev_lockdep_set_classes(dev); @@ -315,7 +311,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, l2tp_eth_adjust_mtu(tunnel, session, dev); priv = netdev_priv(dev); - priv->dev = dev; priv->session = session; priv->tunnel_sock = tunnel->sock; -- cgit v1.2.3 From 5b32fe070c2ddf31adc42c26dab8af346b652538 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:13 -0400 Subject: net: dsa: get ports within parsing code There is no point into hiding the -EINVAL error code in ERR_PTR from a dsa_get_ports function, simply get the "ports" node directly from within the dsa_parse_ports_dn function. This also has the effect to make the pdata and device tree handling code symmetrical inside _dsa_register_switch. At the same time, rename dsa_parse_ports_dn to dsa_parse_ports_of because _of is a more common suffix for device tree parsing functions. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4d1ccf87c59c..9d57f8dee9a1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -590,11 +590,17 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) return 0; } -static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) +static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) { - struct device_node *port; - int err; + struct device_node *ports, *port; u32 reg; + int err; + + ports = of_get_child_by_name(dn, "ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return -EINVAL; + } for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", ®); @@ -665,26 +671,11 @@ static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index) return 0; } -static struct device_node *dsa_get_ports(struct dsa_switch *ds, - struct device_node *np) -{ - struct device_node *ports; - - ports = of_get_child_by_name(np, "ports"); - if (!ports) { - dev_err(ds->dev, "no ports child node found\n"); - return ERR_PTR(-EINVAL); - } - - return ports; -} - static int _dsa_register_switch(struct dsa_switch *ds) { struct dsa_chip_data *pdata = ds->dev->platform_data; struct device_node *np = ds->dev->of_node; struct dsa_switch_tree *dst; - struct device_node *ports; u32 tree, index; int i, err; @@ -693,11 +684,7 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err) return err; - ports = dsa_get_ports(ds, np); - if (IS_ERR(ports)) - return PTR_ERR(ports); - - err = dsa_parse_ports_dn(ports, ds); + err = dsa_parse_ports_of(np, ds); if (err) return err; } else { -- cgit v1.2.3 From fd223e2e66eb076b5dda586db9a5a3c99f76f99a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:14 -0400 Subject: net: dsa: add port parse functions Add symmetrical DSA port parsing functions for pdata and device tree, used to parse and validate a given port node or platform data. They don't do much for the moment but will be extended later on to assign a port type and get device references. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9d57f8dee9a1..a0ee91cd3814 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -590,9 +590,17 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) return 0; } +static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) +{ + dp->dn = dn; + + return 0; +} + static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) { struct device_node *ports, *port; + struct dsa_port *dp; u32 reg; int err; @@ -610,22 +618,45 @@ static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) if (reg >= ds->num_ports) return -EINVAL; - ds->ports[reg].dn = port; + dp = &ds->ports[reg]; + + err = dsa_port_parse_of(dp, port); + if (err) + return err; } return 0; } +static int dsa_port_parse(struct dsa_port *dp, const char *name, + struct device *dev) +{ + dp->name = name; + + return 0; +} + static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds) { bool valid_name_found = false; + struct dsa_port *dp; + struct device *dev; + const char *name; unsigned int i; + int err; for (i = 0; i < DSA_MAX_PORTS; i++) { - if (!cd->port_names[i]) + name = cd->port_names[i]; + dev = cd->netdev[i]; + dp = &ds->ports[i]; + + if (!name) continue; - ds->ports[i].name = cd->port_names[i]; + err = dsa_port_parse(dp, name, dev); + if (err) + return err; + valid_name_found = true; } -- cgit v1.2.3 From 6d4e5c570c2d66c806ecc6bd851fcf881fe8a38e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:15 -0400 Subject: net: dsa: get port type at parse time Assign a port's type at parsed time instead of waiting for the tree to be completed. Because this is now done earlier, we can use the port's type in dsa_port_is_* helpers instead of digging again in topology description. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index a0ee91cd3814..895c38427ef0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -87,23 +87,17 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst, */ static bool dsa_port_is_valid(struct dsa_port *port) { - return !!(port->dn || port->name); + return port->type != DSA_PORT_TYPE_UNUSED; } static bool dsa_port_is_dsa(struct dsa_port *port) { - if (port->name && !strcmp(port->name, "dsa")) - return true; - else - return !!of_parse_phandle(port->dn, "link", 0); + return port->type == DSA_PORT_TYPE_DSA; } static bool dsa_port_is_cpu(struct dsa_port *port) { - if (port->name && !strcmp(port->name, "cpu")) - return true; - else - return !!of_parse_phandle(port->dn, "ethernet", 0); + return port->type == DSA_PORT_TYPE_CPU; } static bool dsa_ds_find_port_dn(struct dsa_switch *ds, @@ -183,8 +177,6 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) err = dsa_port_complete(dst, ds, port, index); if (err != 0) return err; - - port->type = DSA_PORT_TYPE_DSA; } return 0; @@ -500,8 +492,6 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->master = ethernet_dev; } - port->type = DSA_PORT_TYPE_CPU; - tag_protocol = ds->ops->get_tag_protocol(ds); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) { @@ -534,8 +524,6 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) err = dsa_cpu_parse(port, index, dst, ds); if (err) return err; - } else { - port->type = DSA_PORT_TYPE_USER; } } @@ -592,6 +580,17 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) { + struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); + struct device_node *link = of_parse_phandle(dn, "link", 0); + + if (ethernet) { + dp->type = DSA_PORT_TYPE_CPU; + } else if (link) { + dp->type = DSA_PORT_TYPE_DSA; + } else { + dp->type = DSA_PORT_TYPE_USER; + } + dp->dn = dn; return 0; @@ -631,6 +630,14 @@ static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) static int dsa_port_parse(struct dsa_port *dp, const char *name, struct device *dev) { + if (!strcmp(name, "cpu")) { + dp->type = DSA_PORT_TYPE_CPU; + } else if (!strcmp(name, "dsa")) { + dp->type = DSA_PORT_TYPE_DSA; + } else { + dp->type = DSA_PORT_TYPE_USER; + } + dp->name = name; return 0; -- cgit v1.2.3 From cbabb0ac01052f79cf96d7b7e3d3451ffd275864 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:17 -0400 Subject: net: dsa: get master device at port parsing time Fetching the master device can be done directly when a port is parsed from device tree or pdata, instead of waiting until dsa_dst_parse. Now that -EPROBE_DEFER is returned before we add the switch to the tree, there is no need to check for this error after dsa_dst_parse. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 895c38427ef0..8cd84c1b3dc0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -470,27 +470,9 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, { const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - struct net_device *ethernet_dev; - struct device_node *ethernet; - if (port->dn) { - ethernet = of_parse_phandle(port->dn, "ethernet", 0); - if (!ethernet) - return -EINVAL; - ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; - } else { - ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]); - if (!ethernet_dev) - return -EPROBE_DEFER; - dev_put(ethernet_dev); - } - - if (!dst->cpu_dp) { + if (!dst->cpu_dp) dst->cpu_dp = port; - dst->cpu_dp->master = ethernet_dev; - } tag_protocol = ds->ops->get_tag_protocol(ds); tag_ops = dsa_resolve_tag_protocol(tag_protocol); @@ -584,7 +566,14 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) struct device_node *link = of_parse_phandle(dn, "link", 0); if (ethernet) { + struct net_device *master; + + master = of_find_net_device_by_node(ethernet); + if (!master) + return -EPROBE_DEFER; + dp->type = DSA_PORT_TYPE_CPU; + dp->master = master; } else if (link) { dp->type = DSA_PORT_TYPE_DSA; } else { @@ -631,7 +620,16 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name, struct device *dev) { if (!strcmp(name, "cpu")) { + struct net_device *master; + + master = dsa_dev_to_net_device(dev); + if (!master) + return -EPROBE_DEFER; + + dev_put(master); + dp->type = DSA_PORT_TYPE_CPU; + dp->master = master; } else if (!strcmp(name, "dsa")) { dp->type = DSA_PORT_TYPE_DSA; } else { @@ -773,14 +771,8 @@ static int _dsa_register_switch(struct dsa_switch *ds) } err = dsa_dst_parse(dst); - if (err) { - if (err == -EPROBE_DEFER) { - dsa_dst_del_ds(dst, ds, ds->index); - return err; - } - + if (err) goto out_del_dst; - } err = dsa_dst_apply(dst); if (err) { -- cgit v1.2.3 From 1838fa89a22cbc9ec87e995683e241a82d87e6df Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:18 -0400 Subject: net: dsa: get port name at parse time Get the optional "label" property and assign a default one directly at parse time instead of doing it when creating the slave. For legacy, simply assign the port name stored in cd->port_names. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 10 +++++----- net/dsa/legacy.c | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 8cd84c1b3dc0..3c134ff26863 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -260,11 +260,6 @@ static int dsa_user_port_apply(struct dsa_port *port) const char *name = port->name; int err; - if (port->dn) - name = of_get_property(port->dn, "label", NULL); - if (!name) - name = "eth%d"; - err = dsa_slave_create(port, name); if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", @@ -564,6 +559,7 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) { struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); struct device_node *link = of_parse_phandle(dn, "link", 0); + const char *name = of_get_property(dn, "label", NULL); if (ethernet) { struct net_device *master; @@ -577,7 +573,11 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) } else if (link) { dp->type = DSA_PORT_TYPE_DSA; } else { + if (!name) + name = "eth%d"; + dp->type = DSA_PORT_TYPE_USER; + dp->name = name; } dp->dn = dn; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ed7aae342fca..afe6e1539bd0 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -115,6 +115,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, name = cd->port_names[i]; if (name == NULL) continue; + dp->name = name; if (!strcmp(name, "cpu")) { if (dst->cpu_dp) { -- cgit v1.2.3 From 951259aa60180e2897d28b538bf68a3a213da471 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 27 Oct 2017 15:55:19 -0400 Subject: net: dsa: remove name arg from slave create Now that slave dsa_port always have their name set, there is no need to pass it to dsa_slave_create() anymore. Remove this argument. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 3 +-- net/dsa/dsa_priv.h | 2 +- net/dsa/legacy.c | 2 +- net/dsa/slave.c | 3 ++- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 3c134ff26863..797d1156b4e6 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -257,10 +257,9 @@ static void dsa_cpu_port_unapply(struct dsa_port *port) static int dsa_user_port_apply(struct dsa_port *port) { struct dsa_switch *ds = port->ds; - const char *name = port->name; int err; - err = dsa_slave_create(port, name); + err = dsa_slave_create(port); if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", port->index, err); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1e65afd6989e..253a613c40cd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -163,7 +163,7 @@ void dsa_port_fixed_link_unregister_of(struct dsa_port *dp); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -int dsa_slave_create(struct dsa_port *port, const char *name); +int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index afe6e1539bd0..0511fe2feff7 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -197,7 +197,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (dsa_is_user_port(ds, i)) continue; - ret = dsa_slave_create(&ds->ports[i], cd->port_names[i]); + ret = dsa_slave_create(&ds->ports[i]); if (ret < 0) netdev_err(master, "[%d]: can't create dsa slave device for port %d(%s): %d\n", index, i, cd->port_names[i], ret); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 808e205227c3..48b954a76b0d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1142,11 +1142,12 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) call_dsa_notifiers(val, dev, &rinfo.info); } -int dsa_slave_create(struct dsa_port *port, const char *name) +int dsa_slave_create(struct dsa_port *port) { struct dsa_port *cpu_dp = port->cpu_dp; struct net_device *master = cpu_dp->master; struct dsa_switch *ds = port->ds; + const char *name = port->name; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; -- cgit v1.2.3 From 4f6265d485ea0a2507692ded8ed47b323f49587c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:12 -0700 Subject: netlink: Allow ext_ack to carry non-error messages The NLMSGERR API already carries data (eg, a cookie) on the success path. Allow a message string to be returned as well. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 767c84e10e20..26ded4239611 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2313,17 +2313,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, * requests to cap the error message, and get extra error data if * requested. */ + if (nlk_has_extack && extack && extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (err) { if (!(nlk->flags & NETLINK_F_CAP_ACK)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; - if (nlk_has_extack && extack) { - if (extack->_msg) - tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - } + if (nlk_has_extack && extack && extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); } else { flags |= NLM_F_CAPPED; @@ -2348,10 +2347,11 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); if (nlk_has_extack && extack) { + if (extack->_msg) { + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + } if (err) { - if (extack->_msg) - WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, - extack->_msg)); if (extack->bad_attr && !WARN_ON((u8 *)extack->bad_attr < in_skb->data || (u8 *)extack->bad_attr >= in_skb->data + -- cgit v1.2.3 From 6c31e5a91fde2e718e59c8a627c56451f88be54c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb through stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifer handlers to return messages to user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/fib_rules.c | 9 ++++++--- net/ipv4/fib_trie.c | 13 ++++++++----- net/ipv6/ip6_fib.c | 15 +++++++++------ 3 files changed, 23 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a6d97c1d810..fafd0a41e3f7 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -314,10 +314,12 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, - struct fib_rules_ops *ops) + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, + .info.extack = extack, .rule = rule, }; @@ -609,7 +611,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -749,7 +751,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index aaa1ba09afaa..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -102,9 +102,11 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -1214,7 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1269,7 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1569,7 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1886,7 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, fa); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1ada9672d198..2e2804f5823e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -353,9 +353,11 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct rt6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; @@ -868,7 +870,8 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1011,7 +1014,7 @@ add: rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1040,7 +1043,7 @@ add: rt->dst.rt6_next = iter->dst.rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -1225,7 +1228,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc); + err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); @@ -1686,7 +1689,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); -- cgit v1.2.3 From db32919005d81a375093936a1f625f2d08c2d299 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:46:21 +0800 Subject: ip_vti: remove the useless err_count check in vti_xmit Unlike ipip and gre, ip_vti never uses err_count in vti4_err, so no need to check err_count in vti_xmit, it's value always 0. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 58465c0a8682..949f432a5f04 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -198,15 +198,6 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); -- cgit v1.2.3 From 68ad08c4f8ee6254ab814d15b9765f7f82ed0ae7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 28 Oct 2017 14:38:45 -0500 Subject: net: decnet: dn_nsp_in: use swap macro in dn_nsp_rx_packet Make use of the swap macro and remove unnecessary variable tmp. This makes the code easier to read and maintain. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/decnet/dn_nsp_in.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 7ac086d5c0c0..1b2120645730 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -776,12 +776,8 @@ static int dn_nsp_rx_packet(struct net *net, struct sock *sk2, * Swap src & dst and look up in the normal way. */ if (unlikely(cb->rt_flags & DN_RT_F_RTS)) { - __le16 tmp = cb->dst_port; - cb->dst_port = cb->src_port; - cb->src_port = tmp; - tmp = cb->dst; - cb->dst = cb->src; - cb->src = tmp; + swap(cb->dst_port, cb->src_port); + swap(cb->dst, cb->src); } /* -- cgit v1.2.3 From 3a7943ba5b7763488753305fddb0a50e911e0617 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 28 Oct 2017 15:39:48 -0500 Subject: net: decnet: dn_nsp_out: use swap macro in dn_mk_ack_header Make use of the swap macro and remove unnecessary variable tmp. This makes the code easier to read and maintain. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/decnet/dn_nsp_out.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index e50a4adfcf7e..56a52a004c56 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -313,11 +313,8 @@ static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned c ackcrs |= 0x8000; /* If this is an "other data/ack" message, swap acknum and ackcrs */ - if (other) { - unsigned short tmp = acknum; - acknum = ackcrs; - ackcrs = tmp; - } + if (other) + swap(acknum, ackcrs); /* Set "cross subchannel" bit in ackcrs */ ackcrs |= 0x2000; -- cgit v1.2.3 From 54df7ef51193641010a934c231c5aba117aea46d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 28 Oct 2017 15:48:47 -0500 Subject: net: dccp: ccids: lib: packet_history: use swap macro in tfrc_rx_hist_swap Make use of the swap macro and remove unnecessary variable tmp. This makes the code easier to read and maintain. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/dccp/ccids/lib/packet_history.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 08df7a3acb3d..876e18592d71 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -149,10 +149,8 @@ static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { const u8 idx_a = tfrc_rx_hist_index(h, a), idx_b = tfrc_rx_hist_index(h, b); - struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; - h->ring[idx_a] = h->ring[idx_b]; - h->ring[idx_b] = tmp; + swap(h->ring[idx_a], h->ring[idx_b]); } /* -- cgit v1.2.3 From 1a3fbd3fdec55007f1b18cf96c4a9e486c6e875b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 30 Oct 2017 12:56:33 +0200 Subject: net: bridge: add neigh_suppress to bridge port policies Add an entry for IFLA_BRPORT_NEIGH_SUPPRESS to bridge port policies. Fixes: 821f1b21cabb ("bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e732403669c6..26aeb0b5cf30 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -651,6 +651,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, + [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ -- cgit v1.2.3 From 3051fbec206eb6967b7fdecedb63ebb1ed67a1a7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Oct 2017 10:07:17 -0700 Subject: net: sit: Update lookup to handle links set to L3 slave Using SIT tunnels with VRFs works fine if the underlay device is in a VRF and the link parameter is set to the VRF device. e.g., ip tunnel add jtun mode sit remote local dev myvrf Update the device check to allow the link to be the enslaved device as well. e.g., ip tunnel add jtun mode sit remote local dev eth4 where eth4 is enslaved to myvrf. Reported-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/sit.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a799f5258614..d60ddcb0bfe2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -91,29 +91,35 @@ struct sit_net { * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, - struct net_device *dev, __be32 remote, __be32 local) + struct net_device *dev, + __be32 remote, __be32 local, + int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); + int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } @@ -486,6 +492,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; + int sifindex; int err; switch (type) { @@ -517,10 +524,9 @@ static int ipip6_err(struct sk_buff *skb, u32 info) err = -ENOENT; - t = ipip6_tunnel_lookup(dev_net(skb->dev), - skb->dev, - iph->daddr, - iph->saddr); + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; + t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->daddr, iph->saddr, sifindex); if (!t) goto out; @@ -633,10 +639,12 @@ static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; + int sifindex; int err; + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, sifindex); if (tunnel) { struct pcpu_sw_netstats *tstats; @@ -704,10 +712,13 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; + int sifindex; + + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, sifindex); if (tunnel) { const struct tnl_ptk_info *tpi; -- cgit v1.2.3 From 31b102bb501bea50ebc10f4aecf9d788305b8b87 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Oct 2017 14:06:45 -0700 Subject: net: tipc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jon Maloy Cc: Ying Xue Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: tipc-discussion@lists.sourceforge.net Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/tipc/discover.c | 6 +++--- net/tipc/monitor.c | 6 +++--- net/tipc/node.c | 8 ++++---- net/tipc/socket.c | 10 +++++----- net/tipc/subscr.c | 6 +++--- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 02462d67d191..92e4828c6b09 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -224,9 +224,9 @@ void tipc_disc_remove_dest(struct tipc_link_req *req) * * Called whenever a link setup request timer associated with a bearer expires. */ -static void disc_timeout(unsigned long data) +static void disc_timeout(struct timer_list *t) { - struct tipc_link_req *req = (struct tipc_link_req *)data; + struct tipc_link_req *req = from_timer(req, t, timer); struct sk_buff *skb; int max_delay; @@ -292,7 +292,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, req->num_nodes = 0; req->timer_intv = TIPC_LINK_REQ_INIT; spin_lock_init(&req->lock); - setup_timer(&req->timer, disc_timeout, (unsigned long)req); + timer_setup(&req->timer, disc_timeout, 0); mod_timer(&req->timer, jiffies + req->timer_intv); b->link_req = req; *skb = skb_clone(req->buf, GFP_ATOMIC); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 9e109bb1a207..b9c32557d73c 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -578,9 +578,9 @@ void tipc_mon_get_state(struct net *net, u32 addr, read_unlock_bh(&mon->lock); } -static void mon_timeout(unsigned long m) +static void mon_timeout(struct timer_list *t) { - struct tipc_monitor *mon = (void *)m; + struct tipc_monitor *mon = from_timer(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; @@ -623,7 +623,7 @@ int tipc_mon_create(struct net *net, int bearer_id) self->is_up = true; self->is_head = true; INIT_LIST_HEAD(&self->list); - setup_timer(&mon->timer, mon_timeout, (unsigned long)mon); + timer_setup(&mon->timer, mon_timeout, 0); mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); mod_timer(&mon->timer, jiffies + mon->timer_intv); return 0; diff --git a/net/tipc/node.c b/net/tipc/node.c index 89f8ac73bf65..009a81631280 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -153,7 +153,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); -static void tipc_node_timeout(unsigned long data); +static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static void tipc_node_put(struct tipc_node *node); @@ -361,7 +361,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) goto exit; } tipc_node_get(n); - setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); + timer_setup(&n->timer, tipc_node_timeout, 0); n->keepalive_intv = U32_MAX; hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { @@ -500,9 +500,9 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) /* tipc_node_timeout - handle expiration of node timer */ -static void tipc_node_timeout(unsigned long data) +static void tipc_node_timeout(struct timer_list *t) { - struct tipc_node *n = (struct tipc_node *)data; + struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int bearer_id; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ea61c32f6b80..5d18c0caa92b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -125,7 +125,7 @@ static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); -static void tipc_sk_timeout(unsigned long data); +static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, @@ -464,7 +464,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, NAMED_H_SIZE, 0); msg_set_origport(msg, tsk->portid); - setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); + timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; @@ -2530,14 +2530,14 @@ static int tipc_shutdown(struct socket *sock, int how) return res; } -static void tipc_sk_timeout(unsigned long data) +static void tipc_sk_timeout(struct timer_list *t) { - struct tipc_sock *tsk = (struct tipc_sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); + struct tipc_sock *tsk = tipc_sk(sk); u32 peer_port = tsk_peer_port(tsk); u32 peer_node = tsk_peer_node(tsk); u32 own_node = tsk_own_node(tsk); u32 own_port = tsk->portid; - struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index be3d9e3183dc..251065dfd8df 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -133,9 +133,9 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, node); } -static void tipc_subscrp_timeout(unsigned long data) +static void tipc_subscrp_timeout(struct timer_list *t) { - struct tipc_subscription *sub = (struct tipc_subscription *)data; + struct tipc_subscription *sub = from_timer(sub, t, timer); struct tipc_subscriber *subscriber = sub->subscriber; spin_lock_bh(&subscriber->lock); @@ -303,7 +303,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); - setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); + timer_setup(&sub->timer, tipc_subscrp_timeout, 0); timeout = htohl(sub->evt.s.timeout, swap); if (timeout != TIPC_WAIT_FOREVER) -- cgit v1.2.3 From 4c31606920bab227de10d2c9bf9b4dd8d4327638 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 27 Oct 2017 00:51:04 -0500 Subject: net: netrom: nr_route: refactor code in nr_add_node Code refactoring in order to make the code easier to read and maintain. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/netrom/nr_route.c | 59 ++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 0c59354e280e..fba4b4c8efaf 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -80,6 +80,19 @@ static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, static void nr_remove_neigh(struct nr_neigh *); +/* re-sort the routes in quality order. */ +static void re_sort_routes(struct nr_node *nr_node, int x, int y) +{ + if (nr_node->routes[y].quality > nr_node->routes[x].quality) { + if (nr_node->which == x) + nr_node->which = y; + else if (nr_node->which == y) + nr_node->which = x; + + swap(nr_node->routes[x], nr_node->routes[y]); + } +} + /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. @@ -90,7 +103,6 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, { struct nr_node *nr_node; struct nr_neigh *nr_neigh; - struct nr_route nr_route; int i, found; struct net_device *odev; @@ -251,49 +263,10 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, /* Now re-sort the routes in quality order */ switch (nr_node->count) { case 3: - if (nr_node->routes[1].quality > nr_node->routes[0].quality) { - switch (nr_node->which) { - case 0: - nr_node->which = 1; - break; - case 1: - nr_node->which = 0; - break; - } - nr_route = nr_node->routes[0]; - nr_node->routes[0] = nr_node->routes[1]; - nr_node->routes[1] = nr_route; - } - if (nr_node->routes[2].quality > nr_node->routes[1].quality) { - switch (nr_node->which) { - case 1: nr_node->which = 2; - break; - - case 2: nr_node->which = 1; - break; - - default: - break; - } - nr_route = nr_node->routes[1]; - nr_node->routes[1] = nr_node->routes[2]; - nr_node->routes[2] = nr_route; - } + re_sort_routes(nr_node, 0, 1); + re_sort_routes(nr_node, 1, 2); case 2: - if (nr_node->routes[1].quality > nr_node->routes[0].quality) { - switch (nr_node->which) { - case 0: nr_node->which = 1; - break; - - case 1: nr_node->which = 0; - break; - - default: break; - } - nr_route = nr_node->routes[0]; - nr_node->routes[0] = nr_node->routes[1]; - nr_node->routes[1] = nr_route; - } + re_sort_routes(nr_node, 0, 1); case 1: break; } -- cgit v1.2.3 From 31f74f0f4e8dfe5b30ff59d9a38a370fbb725f38 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 27 Oct 2017 00:51:08 -0500 Subject: net: netrom: nr_route: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/netrom/nr_route.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index fba4b4c8efaf..75e6ba970fde 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -265,6 +265,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, case 3: re_sort_routes(nr_node, 0, 1); re_sort_routes(nr_node, 1, 2); + /* fall through */ case 2: re_sort_routes(nr_node, 0, 1); case 1: @@ -357,6 +358,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n switch (i) { case 0: nr_node->routes[0] = nr_node->routes[1]; + /* fall through */ case 1: nr_node->routes[1] = nr_node->routes[2]; case 2: @@ -526,6 +528,7 @@ void nr_rt_device_down(struct net_device *dev) switch (i) { case 0: t->routes[0] = t->routes[1]; + /* fall through */ case 1: t->routes[1] = t->routes[2]; case 2: -- cgit v1.2.3 From da13c59b9936dfedcf9f2203bd29fbf83ad672bf Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 30 Oct 2017 19:38:52 -0400 Subject: net: display hw address of source machine during ipv6 DAD failure This patch updates the error messages displayed in kernel log to include hwaddress of the source machine that caused ipv6 duplicate address detection failures. Examples: a) When we receive a NA packet from another machine advertising our address: ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0! b) When we detect DAD failure during address assignment to an interface: IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8 detected! v2: Changed %pI6 to %pI6c in ndisc_recv_na() Chaged the v6 address in the commit message to 2001:db8:: Suggested-by: Igor Lubashev Signed-off-by: Vishwanath Pai Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ndisc.c | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5a8a10229a07..cfa374c8b54c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1987,7 +1987,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) return err; } -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(ifp->idev->dev); @@ -1997,8 +1997,8 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) return; } - net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", - ifp->idev->dev->name, &ifp->addr); + net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n", + ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source); spin_lock_bh(&ifp->lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 266a530414d7..f9c3ffe04382 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -46,6 +46,7 @@ #endif #include +#include #include #include #include @@ -822,7 +823,7 @@ have_ifp: * who is doing DAD * so fail our DAD process */ - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } else { /* @@ -975,7 +976,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } /* What should we make now? The advertisement @@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, - "NA: someone advertises our address %pI6 on %s!\n", - &ifp->addr, ifp->idev->dev->name); + "NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } -- cgit v1.2.3 From dc82673f0cb5175dcec87041441cdb107932cd07 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 31 Oct 2017 13:28:16 +0000 Subject: sctp: fix error return code in sctp_send_add_streams() Fix to returnerror code -ENOMEM from the sctp_make_strreset_addstrm() error handling case instead of 0. 'retval' can be overwritten to 0 after call sctp_stream_alloc_out(). Fixes: e090abd0d81c ("sctp: factor out stream->out allocation") Signed-off-by: Wei Yongjun Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 03764fc2b652..b8c8cabb1a58 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -393,7 +393,7 @@ int sctp_send_add_streams(struct sctp_association *asoc, { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; - int retval = -ENOMEM; + int retval; __u32 outcnt, incnt; __u16 out, in; @@ -425,8 +425,10 @@ int sctp_send_add_streams(struct sctp_association *asoc, } chunk = sctp_make_strreset_addstrm(asoc, out, in); - if (!chunk) + if (!chunk) { + retval = -ENOMEM; goto out; + } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); -- cgit v1.2.3 From 032a480202245e384fdbcac92da720d697384d8e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 31 Oct 2017 14:32:38 +0100 Subject: ipv4: fix validate_source for VRF setup David reported breakages of VRF scenarios due to the commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled."): the local addresses based test is too strict when VRFs are in place. With this change we fall-back to a full lookup when custom fib rules are in place; so that we address the VRF use case and possibly other similar issues in non trivial setups. v1 -> v2: - fix build breakage when CONFIG_IP_MULTIPLE_TABLES is not defined, reported by the kbuild test robot Reported-by: David Ahern Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") Signed-off-by: Paolo Abeni Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f02819134ba2..f52d27a422c3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -73,6 +73,11 @@ fail: fib_free_table(main_table); return -ENOMEM; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return false; +} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -128,6 +133,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -405,10 +415,12 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; - /* if no local routes are added from user space we can check - * for local addresses looking-up the ifaddr table + /* with custom local routes in place, checking local addresses + * only will be too optimistic, with custom rules, checking + * local addresses only can be too strict, e.g. due to vrf */ - if (net->ipv4.fib_has_custom_local_routes) + if (net->ipv4.fib_has_custom_local_routes || + fib4_has_custom_rules(net)) goto full_check; if (inet_lookup_ifaddr_rcu(net, src)) return -EINVAL; -- cgit v1.2.3 From 909fb9ae322154648d7354bcf7cdd0f705fb67f1 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:00 +0100 Subject: net: dsa: lan9303: Transmit using ALR when unicast lan9303_xmit_use_arl() introduced in previous patch set is wrong. The chip flood broadcast and unknown multicast frames. The effect is that broadcasts and multicasts are duplicated on egress. It is not possible to configure the chip to direct unknown multicasts to CPU port only. This means that only unicast frames can be transmitted using ALR lookup. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 64092325aac3..537ca991fafe 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -46,7 +46,7 @@ /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. - * If the two external ports are bridged and the packet is not STP BPDU, + * If the two external ports are bridged and the frame is unicast, * then use ALR lookup to allow ALR learning on CPU port. * Otherwise transmit directly to port with STP state override. * See also: lan9303_separate_ports() and lan9303.pdf 6.4.10.1 @@ -55,7 +55,7 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr) { struct lan9303 *chip = dp->ds->priv; - return chip->is_bridged && !ether_addr_equal(dest_addr, eth_stp_addr); + return chip->is_bridged && !is_multicast_ether_addr(dest_addr); } static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From e9292f2c03851ef81bef38579a0ee9c42140e586 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:01 +0100 Subject: net: dsa: lan9303: Add STP ALR entry on port 0 STP BPDUs arriving on user ports must sent to CPU port only, for processing by the SW bridge. Add an ALR entry with STP state override to fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 537ca991fafe..18f45cd9f625 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,7 +42,6 @@ #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) -#define eth_stp_addr eth_reserved_addr_base /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. -- cgit v1.2.3 From f849772915e5501c25f69f4815daa8cd86194634 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:02 +0100 Subject: net: dsa: lan9303: lan9303_rcv set skb->offload_fwd_mark The chip flood broadcast and unknown multicast frames. On receive set skb->offload_fwd_mark to prevent the SW from flooding to the same ports. One exception: Because the ALR is set up to forward STP BPDUs only to CPU, the SW bridge should flood STP BPDUs if local STP is not enabled. This is archived by not setting skb->offload_fwd_mark on STP BPDUs. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 18f45cd9f625..e526c8967b98 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -126,6 +126,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull_rcsum(skb, 2 + 2); memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); + skb->offload_fwd_mark = !ether_addr_equal(skb->data - ETH_HLEN, + eth_stp_addr); return skb; } -- cgit v1.2.3 From 928990631327cf00a9195e30fa22f7ae5f8d7e67 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 1 Nov 2017 12:18:13 +0200 Subject: net: bridge: add notifications for the bridge dev on vlan change Currently the bridge device doesn't generate any notifications upon vlan modifications on itself because it doesn't use the generic bridge notifications. With the recent changes we know if anything was modified in the vlan config thus we can generate a notification when necessary for the bridge device so add support to br_ifinfo_notify() similar to how other combined functions are done - if port is present it takes precedence, otherwise notify about the bridge. I've explicitly marked the locations where the notification should be always for the port by setting bridge to NULL. I've also taken the liberty to rearrange each modified function's local variables in reverse xmas tree as well. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br.c | 2 +- net/bridge/br_if.c | 4 ++-- net/bridge/br_ioctl.c | 2 +- net/bridge/br_netlink.c | 58 ++++++++++++++++++++++++++--------------------- net/bridge/br_private.h | 3 ++- net/bridge/br_stp.c | 6 ++--- net/bridge/br_stp_if.c | 4 ++-- net/bridge/br_stp_timer.c | 2 +- net/bridge/br_sysfs_if.c | 2 +- 9 files changed, 45 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 1407d1ba7577..6bf06e756df2 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -112,7 +112,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v /* Events that may cause spanning tree to refresh */ if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN) - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ae38547bbf91..9ba4ed65c52b 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -271,7 +271,7 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); - br_ifinfo_notify(RTM_DELLINK, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) @@ -589,7 +589,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, br_stp_enable_port(p); spin_unlock_bh(&br->lock); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 16d01a3ff33f..73b957fd639d 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -293,7 +293,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ret) { if (p) - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); else netdev_state_change(br->dev); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 26aeb0b5cf30..67bae0f11c67 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -361,14 +361,14 @@ nla_put_failure: * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, - struct net_bridge_port *port, + const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev) { + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; - u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; if (port) br = port->br; @@ -454,28 +454,36 @@ nla_put_failure: return -EMSGSIZE; } -/* - * Notify listeners of a change in port information - */ -void br_ifinfo_notify(int event, struct net_bridge_port *port) +/* Notify listeners of a change in bridge or port information */ +void br_ifinfo_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port) { - struct net *net; + u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; + struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; - u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; + struct net *net; + u16 port_no = 0; - if (!port) + if (WARN_ON(!port && !br)) return; - net = dev_net(port->dev); - br_debug(port->br, "port %u(%s) event %d\n", - (unsigned int)port->port_no, port->dev->name, event); + if (port) { + dev = port->dev; + br = port->br; + port_no = port->port_no; + } else { + dev = br->dev; + } + + net = dev_net(dev); + br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event); - skb = nlmsg_new(br_nlmsg_size(port->dev, filter), GFP_ATOMIC); + skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC); if (skb == NULL) goto errout; - err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, port->dev); + err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -488,7 +496,6 @@ errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } - /* * Dump information about all ports, in response to GETLINK */ @@ -809,10 +816,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) /* Change state and parameters on port. */ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { + struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); + struct nlattr *tb[IFLA_BRPORT_MAX + 1]; + struct net_bridge_port *p; struct nlattr *protinfo; struct nlattr *afspec; - struct net_bridge_port *p; - struct nlattr *tb[IFLA_BRPORT_MAX + 1]; bool changed = false; int err = 0; @@ -852,13 +860,11 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) changed = true; } - if (afspec) { - err = br_afspec((struct net_bridge *)netdev_priv(dev), p, - afspec, RTM_SETLINK, &changed); - } + if (afspec) + err = br_afspec(br, p, afspec, RTM_SETLINK, &changed); if (changed) - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, br, p); out: return err; } @@ -866,8 +872,9 @@ out: /* Delete port information */ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { - struct nlattr *afspec; + struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct net_bridge_port *p; + struct nlattr *afspec; bool changed = false; int err = 0; @@ -880,13 +887,12 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (!p && !(dev->priv_flags & IFF_EBRIDGE)) return -EINVAL; - err = br_afspec((struct net_bridge *)netdev_priv(dev), p, - afspec, RTM_DELLINK, &changed); + err = br_afspec(br, p, afspec, RTM_DELLINK, &changed); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels */ - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, br, p); return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 860e4afaf71a..40553d832b6e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1071,7 +1071,8 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); -void br_ifinfo_notify(int event, struct net_bridge_port *port); +void br_ifinfo_notify(int event, const struct net_bridge *br, + const struct net_bridge_port *port); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 8f56c2d1f1a7..b6941961a876 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -123,7 +123,7 @@ static void br_root_port_block(const struct net_bridge *br, (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay > 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); @@ -403,7 +403,7 @@ static void br_make_blocking(struct net_bridge_port *p) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->forward_delay_timer); } @@ -426,7 +426,7 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay != 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 89110319ef0f..808e2b914015 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -96,7 +96,7 @@ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); } /* called under bridge lock */ @@ -111,7 +111,7 @@ void br_stp_disable_port(struct net_bridge_port *p) p->topology_change_ack = 0; p->config_pending = 0; - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->message_age_timer); del_timer(&p->forward_delay_timer); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 60b6fe277a8b..b54c1a331450 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -99,7 +99,7 @@ static void br_forward_delay_timer_expired(unsigned long arg) netif_carrier_on(br->dev); } rcu_read_lock(); - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); rcu_read_unlock(); spin_unlock(&br->lock); } diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 0a1fa9ccd8b7..0254c35b2bf0 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -280,7 +280,7 @@ static ssize_t brport_store(struct kobject *kobj, ret = brport_attr->store(p, val); spin_unlock_bh(&p->br->lock); if (!ret) { - br_ifinfo_notify(RTM_NEWLINK, p); + br_ifinfo_notify(RTM_NEWLINK, NULL, p); ret = count; } } -- cgit v1.2.3 From 44ae12a768b7212976a362c590075716a77e8f28 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:39 +0100 Subject: net: sched: move the can_offload check from binding phase to rule insertion phase This restores the original behaviour before the block callbacks were introduced. Allow the drivers to do binding of block always, no matter if the NETIF_F_HW_TC feature is on or off. Move the check to the block callback which is called for rule insertion. Reported-by: Alexander Duyck Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/dsa/slave.c | 3 +++ net/sched/cls_api.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 48b954a76b0d..9b75d0ac4092 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -799,6 +799,9 @@ static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data, { struct net_device *dev = cb_priv; + if (!tc_can_offload(dev)) + return -EOPNOTSUPP; + switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2c03fcbc7188..15e3216ef25d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -256,7 +256,7 @@ static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; struct tc_block_offload bo = {}; - if (!tc_can_offload(dev)) + if (!dev->netdev_ops->ndo_setup_tc) return; bo.command = command; bo.binder_type = ei->binder_type; -- cgit v1.2.3 From 7612fb0387d6ffcfc3173527466fe3f596657c58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:40 +0100 Subject: net: sched: remove tc_can_offload check from egdev call Since the only user, mlx5 driver does the check in mlx5e_setup_tc_block_cb, no need to check here. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 15e3216ef25d..a26c690b48ac 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1206,7 +1206,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, if (!a->ops->get_dev) continue; dev = a->ops->get_dev(a); - if (!dev || !tc_can_offload(dev)) + if (!dev) continue; ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); if (ret < 0) -- cgit v1.2.3 From 20acbd9a7aeee0b0af7107f3de791a52c949f3ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:06:08 +0000 Subject: rxrpc: Lock around calling a kernel service Rx notification Place a spinlock around the invocation of call->notify_rx() for a kernel service call and lock again when ending the call and replace the notification pointer with a pointer to a dummy function. This is required because it's possible for rxrpc_notify_socket() to be called after the call has been ended by the kernel service if called from the asynchronous work function rxrpc_process_call(). However, rxrpc_notify_socket() currently only holds the RCU read lock when invoking ->notify_rx(), which means that the afs_call struct would need to be disposed of by call_rcu() rather than by kfree(). But we shouldn't see any notifications from a call after calling rxrpc_kernel_end_call(), so a lock is required in rxrpc code. Without this, we may see the call wait queue as having a corrupt spinlock: BUG: spinlock bad magic on CPU#0, kworker/0:2/1612 general protection fault: 0000 [#1] SMP ... Workqueue: krxrpcd rxrpc_process_call task: ffff88040b83c400 task.stack: ffff88040adfc000 RIP: 0010:spin_bug+0x161/0x18f RSP: 0018:ffff88040adffcc0 EFLAGS: 00010002 RAX: 0000000000000032 RBX: 6b6b6b6b6b6b6b6b RCX: ffffffff81ab16cf RDX: ffff88041fa14c01 RSI: ffff88041fa0ccb8 RDI: ffff88041fa0ccb8 RBP: ffff88040adffcd8 R08: 00000000ffffffff R09: 00000000ffffffff R10: ffff88040adffc60 R11: 000000000000022c R12: ffff88040aca2208 R13: ffffffff81a58114 R14: 0000000000000000 R15: 0000000000000000 .... Call Trace: do_raw_spin_lock+0x1d/0x89 _raw_spin_lock_irqsave+0x3d/0x49 ? __wake_up_common_lock+0x4c/0xa7 __wake_up_common_lock+0x4c/0xa7 ? __lock_is_held+0x47/0x7a __wake_up+0xe/0x10 afs_wake_up_call_waiter+0x11b/0x122 [kafs] rxrpc_notify_socket+0x12b/0x258 rxrpc_process_call+0x18e/0x7d0 process_one_work+0x298/0x4de ? rescuer_thread+0x280/0x280 worker_thread+0x1d1/0x2ae ? rescuer_thread+0x280/0x280 kthread+0x12c/0x134 ? kthread_create_on_node+0x3a/0x3a ret_from_fork+0x27/0x40 In this case, note the corrupt data in EBX. The address of the offending afs_call is in R12, plus the offset to the spinlock. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 16 ++++++++++++++++ net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_object.c | 1 + net/rxrpc/recvmsg.c | 2 ++ 4 files changed, 20 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 344b2dcad52d..9b5c46b052fd 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -322,6 +322,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_begin_call); +/* + * Dummy function used to stop the notifier talking to recvmsg(). + */ +static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ +} + /** * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using * @sock: The socket the call is on @@ -336,6 +344,14 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) mutex_lock(&call->user_mutex); rxrpc_release_call(rxrpc_sk(sock->sk), call); + + /* Make sure we're not going to call back into a kernel service */ + if (call->notify_rx) { + spin_lock_bh(&call->notify_lock); + call->notify_rx = rxrpc_dummy_notify_rx; + spin_unlock_bh(&call->notify_lock); + } + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_kernel); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ea5600b747cc..b2151993d384 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -525,6 +525,7 @@ struct rxrpc_call { unsigned long flags; unsigned long events; spinlock_t lock; + spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ u32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fcdd6555a820..4c7fbc6dcce7 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -124,6 +124,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) INIT_LIST_HEAD(&call->sock_link); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); + spin_lock_init(&call->notify_lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index e4937b3f3685..8510a98b87e1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -40,7 +40,9 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { + spin_lock_bh(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); + spin_unlock_bh(&call->notify_lock); } else { write_lock_bh(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { -- cgit v1.2.3 From 1457cc4cfb93511de347d1d0a1c9da3e826b66fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:06:55 +0000 Subject: rxrpc: Fix a null ptr deref in rxrpc_fill_out_ack() rxrpc_fill_out_ack() needs to be passed the connection pointer from its caller rather than using call->conn as the call may be disconnected in parallel with it, clearing call->conn, leading to: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: rxrpc_send_ack_packet+0x231/0x6a4 Signed-off-by: David Howells --- net/rxrpc/output.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 71e6f713fbe7..8ee8b2d4a3eb 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -35,7 +35,8 @@ struct rxrpc_abort_buffer { /* * Fill out an ACK packet. */ -static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, +static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, + struct rxrpc_call *call, struct rxrpc_ack_buffer *pkt, rxrpc_seq_t *_hard_ack, rxrpc_seq_t *_top, @@ -77,8 +78,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, } while (before_eq(seq, top)); } - mtu = call->conn->params.peer->if_mtu; - mtu -= call->conn->params.peer->hdrsize; + mtu = conn->params.peer->if_mtu; + mtu -= conn->params.peer->hdrsize; jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); @@ -148,7 +149,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) } call->ackr_reason = 0; } - n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason); + n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason); spin_unlock_bh(&call->lock); -- cgit v1.2.3 From dcbefc30fbc2c1926bcecdd62579e3e107653d82 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:06:26 +0000 Subject: rxrpc: Fix call expiry handling Fix call expiry handling in the following ways (1) If all the request data from a client call is acked, don't send a follow up IDLE ACK with firstPacket == 1 and previousPacket == 0 as this appears to fool some servers into thinking everything has been accepted. (2) Never send an abort back to the server once it has ACK'd all the request packets; rather just try to reuse the channel for the next call. The first request DATA packet of the next call on the same channel will implicitly ACK the entire reply of the dead call - even if we haven't transmitted it yet. (3) Don't send RX_CALL_TIMEOUT in an ABORT packet, librx uses abort codes to pass local errors to the caller in addition to remote errors, and this is meant to be local only. The following also need to be addressed in future patches: (4) Service calls should send PING ACKs as 'keep alives' if the server is still processing the call. (5) VERSION REPLY packets should be sent to the peers of service connections to act as keep-alives. This is used to keep firewall routes in place. The AFS CM should enable this. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/input.c | 2 -- net/rxrpc/output.c | 10 ++++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7a77844aab16..3574508baf9a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -386,7 +386,7 @@ recheck_state: now = ktime_get_real(); if (ktime_before(call->expire_at, now)) { - rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1e37eb1c0c66..1b592073ec96 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -298,8 +298,6 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true, - rxrpc_propose_ack_client_tx_end); trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); } else { trace_rxrpc_transmit(call, rxrpc_transmit_end); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8ee8b2d4a3eb..f47659c7b224 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -222,6 +222,16 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) rxrpc_serial_t serial; int ret; + /* Don't bother sending aborts for a client call once the server has + * hard-ACK'd all of its request data. After that point, we're not + * going to stop the operation proceeding, and whilst we might limit + * the reply, it's not worth it if we can send a new call on the same + * channel instead, thereby closing off this call. + */ + if (rxrpc_is_client_call(call) && + test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + return 0; + spin_lock_bh(&call->lock); if (call->conn) conn = rxrpc_get_connection_maybe(call->conn); -- cgit v1.2.3 From 47d3d7ac656a1ffb9d0f0d3c845663ed6fd7e78d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options, however these could be used as a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet wide attack might not be so effective (yet!). By my calculation, the worse case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 invidual TLVs (including pad TLVS) or 724 two byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVS (that are ignored), TLV padding, and bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header. The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have being proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau) I tested out 1 thread (i.e. one raw_udp process). I changed the net.ipv6.max_dst_(opts|hbh)_number between 8 to 2048. With sysctls setting to 2048, the softirq% is packed to 100%. With 8, the softirq% is almost unnoticable from mpstat. v2; - Code and documention cleanup. - Change references of RFC2460 to be RFC8200. - Add reference to RFC6434-bis where the limits will be in standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 4 +++ net/ipv6/exthdrs.c | 67 +++++++++++++++++++++++++++++++++++++--------- net/ipv6/sysctl_net_ipv6.c | 32 ++++++++++++++++++++++ 3 files changed, 91 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fe5262fd6aa5..c26f71234b9c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -810,6 +810,10 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; + net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; + net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; + net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; + net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 9f918a770f87..83bd75713535 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -74,8 +74,20 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, + bool disallow_unknowns) { + if (disallow_unknowns) { + /* If unknown TLVs are disallowed by configuration + * then always silently drop packet. Note this also + * means no ICMP parameter problem is sent which + * could be a good property to mitigate a reflection DOS + * attack. + */ + + goto drop; + } + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; @@ -95,20 +107,30 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) return false; } +drop: kfree_skb(skb); return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, + struct sk_buff *skb, + int max_count) { - const struct tlvtype_proc *curr; + int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - int len = (skb_transport_header(skb)[1] + 1) << 3; + const struct tlvtype_proc *curr; + bool disallow_unknowns = false; + int tlv_count = 0; int padlen = 0; + if (unlikely(max_count < 0)) { + disallow_unknowns = true; + max_count = -max_count; + } + if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -149,6 +171,11 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; + + tlv_count++; + if (tlv_count > max_count) + goto bad; + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment @@ -159,10 +186,10 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) break; } } - if (curr->type < 0) { - if (ip6_tlvopt_unknown(skb, off) == 0) - return false; - } + if (curr->type < 0 && + !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) + return false; + padlen = 0; break; } @@ -258,23 +285,31 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); + int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_dst_opts_len) + goto fail_and_free; + opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, + init_net.ipv6.sysctl.max_dst_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; @@ -803,6 +838,8 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + int extlen; /* * skb_network_header(skb) is equal to skb->data, and @@ -813,13 +850,19 @@ int ipv6_parse_hopopts(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + goto fail_and_free; + opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprochopopt_lst, skb, + init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 6fbf8ae5e52c..4a2f0fd870bc 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -97,6 +97,34 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "max_dst_opts_number", + .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_opts_number", + .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_dst_opts_length", + .data = &init_net.ipv6.sysctl.max_dst_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_length", + .data = &init_net.ipv6.sysctl.max_hbh_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -157,6 +185,10 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; + ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt; + ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; + ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; + ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From cf34ce3da1e41579296364509266c7dac573822a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 30 Oct 2017 14:41:35 -0700 Subject: tcp: add tracepoint trace_tcp_retransmit_synack() This tracepoint can be used to trace synack retransmits. It maintains pointer to struct request_sock. We cannot simply reuse trace_tcp_retransmit_skb() here, because the sk here is the LISTEN socket. The IP addresses and ports should be extracted from struct request_sock. Note that, like many other tracepoints, this patch uses IS_ENABLED in TP_fast_assign macro, which triggers sparse warning like: ./include/trace/events/tcp.h:274:1: error: directive in argument list ./include/trace/events/tcp.h:281:1: error: directive in argument list However, there is no good solution to avoid these warnings. To the best of our knowledge, these warnings are harmless. Signed-off-by: Song Liu Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a85e8a282d17..06a0c89ffe40 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3782,6 +3782,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; + trace_tcp_retransmit_synack(sk, req); } return res; } -- cgit v1.2.3 From 054287295b1132c8742ea55f8e3af9cbd630c932 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 2 Nov 2017 10:36:48 +0100 Subject: net: Define eth_stp_addr in linux/etherdevice.h The lan9303 driver defines eth_stp_addr as a synonym to eth_reserved_addr_base to get the STP ethernet address 01:80:c2:00:00:00. eth_reserved_addr_base is also used to define the start of Bridge Reserved ethernet address range, which happen to be the STP address. br_dev_setup refer to eth_reserved_addr_base as a definition of STP address. Clean up by: - Move the eth_stp_addr definition to linux/etherdevice.h - Use eth_stp_addr instead of eth_reserved_addr_base in br_dev_setup. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/bridge/br_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 28bb22186fa0..af5b8c87f590 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -421,7 +421,7 @@ void br_dev_setup(struct net_device *dev) br->bridge_id.prio[0] = 0x80; br->bridge_id.prio[1] = 0x00; - ether_addr_copy(br->group_addr, eth_reserved_addr_base); + ether_addr_copy(br->group_addr, eth_stp_addr); br->stp_enabled = BR_NO_STP; br->group_fwd_mask = BR_GROUPFWD_DEFAULT; -- cgit v1.2.3 From 3ae6ec08292f01c6782d1a80be0b2cc675e0ecfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 2 Nov 2017 17:14:05 +0100 Subject: ipv4: Send a netevent whenever multipath hash policy is changed Devices performing IPv4 forwarding need to update their multipath hash policy whenever it is changed. Inform these devices by generating a netevent. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4602af6d5358..8dcc2b185fcc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -25,6 +25,7 @@ #include #include #include +#include static int zero; static int one = 1; @@ -385,6 +386,23 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", @@ -907,7 +925,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.3 From 88c1f37f05f30d74f56aec1f3bc10d238c9ba152 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 3 Nov 2017 11:51:10 +0530 Subject: net: bridge: Convert timers to use timer_setup() switch to using the new timer_setup() and from_timer() api's. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 79 +++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 7947e0436e18..5f7f0e9d446c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -239,9 +239,9 @@ static void br_multicast_free_group(struct rcu_head *head) kfree(mp); } -static void br_multicast_group_expired(unsigned long data) +static void br_multicast_group_expired(struct timer_list *t) { - struct net_bridge_mdb_entry *mp = (void *)data; + struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; struct net_bridge_mdb_htable *mdb; @@ -302,9 +302,9 @@ static void br_multicast_del_pg(struct net_bridge *br, WARN_ON(1); } -static void br_multicast_port_group_expired(unsigned long data) +static void br_multicast_port_group_expired(struct timer_list *t) { - struct net_bridge_port_group *pg = (void *)data; + struct net_bridge_port_group *pg = from_timer(pg, t, timer); struct net_bridge *br = pg->port->br; spin_lock(&br->multicast_lock); @@ -701,8 +701,7 @@ rehash: mp->br = br; mp->addr = *group; - setup_timer(&mp->timer, br_multicast_group_expired, - (unsigned long)mp); + timer_setup(&mp->timer, br_multicast_group_expired, 0); hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); mdb->size++; @@ -729,8 +728,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->flags = flags; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); - setup_timer(&p->timer, br_multicast_port_group_expired, - (unsigned long)p); + timer_setup(&p->timer, br_multicast_port_group_expired, 0); if (src) memcpy(p->eth_addr, src, ETH_ALEN); @@ -843,9 +841,10 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, } #endif -static void br_multicast_router_expired(unsigned long data) +static void br_multicast_router_expired(struct timer_list *t) { - struct net_bridge_port *port = (void *)data; + struct net_bridge_port *port = + from_timer(port, t, multicast_router_timer); struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); @@ -872,9 +871,9 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr); } -static void br_multicast_local_router_expired(unsigned long data) +static void br_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = (struct net_bridge *)data; + struct net_bridge *br = from_timer(br, t, multicast_router_timer); spin_lock(&br->multicast_lock); if (br->multicast_router == MDB_RTR_TYPE_DISABLED || @@ -900,17 +899,17 @@ out: spin_unlock(&br->multicast_lock); } -static void br_ip4_multicast_querier_expired(unsigned long data) +static void br_ip4_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = (void *)data; + struct net_bridge *br = from_timer(br, t, ip4_other_query.timer); br_multicast_querier_expired(br, &br->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_querier_expired(unsigned long data) +static void br_ip6_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = (void *)data; + struct net_bridge *br = from_timer(br, t, ip6_other_query.timer); br_multicast_querier_expired(br, &br->ip6_own_query); } @@ -1011,17 +1010,17 @@ out: spin_unlock(&br->multicast_lock); } -static void br_ip4_multicast_port_query_expired(unsigned long data) +static void br_ip4_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = (void *)data; + struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer); br_multicast_port_query_expired(port, &port->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_port_query_expired(unsigned long data) +static void br_ip6_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = (void *)data; + struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer); br_multicast_port_query_expired(port, &port->ip6_own_query); } @@ -1043,13 +1042,13 @@ int br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - setup_timer(&port->multicast_router_timer, br_multicast_router_expired, - (unsigned long)port); - setup_timer(&port->ip4_own_query.timer, - br_ip4_multicast_port_query_expired, (unsigned long)port); + timer_setup(&port->multicast_router_timer, + br_multicast_router_expired, 0); + timer_setup(&port->ip4_own_query.timer, + br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - setup_timer(&port->ip6_own_query.timer, - br_ip6_multicast_port_query_expired, (unsigned long)port); + timer_setup(&port->ip6_own_query.timer, + br_ip6_multicast_port_query_expired, 0); #endif br_mc_disabled_update(port->dev, port->br->multicast_disabled); @@ -1933,17 +1932,17 @@ static void br_multicast_query_expired(struct net_bridge *br, spin_unlock(&br->multicast_lock); } -static void br_ip4_multicast_query_expired(unsigned long data) +static void br_ip4_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = (void *)data; + struct net_bridge *br = from_timer(br, t, ip4_own_query.timer); br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_query_expired(unsigned long data) +static void br_ip6_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = (void *)data; + struct net_bridge *br = from_timer(br, t, ip6_own_query.timer); br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); } @@ -1978,17 +1977,17 @@ void br_multicast_init(struct net_bridge *br) br->has_ipv6_addr = 1; spin_lock_init(&br->multicast_lock); - setup_timer(&br->multicast_router_timer, - br_multicast_local_router_expired, (unsigned long)br); - setup_timer(&br->ip4_other_query.timer, - br_ip4_multicast_querier_expired, (unsigned long)br); - setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, - (unsigned long)br); + timer_setup(&br->multicast_router_timer, + br_multicast_local_router_expired, 0); + timer_setup(&br->ip4_other_query.timer, + br_ip4_multicast_querier_expired, 0); + timer_setup(&br->ip4_own_query.timer, + br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - setup_timer(&br->ip6_other_query.timer, - br_ip6_multicast_querier_expired, (unsigned long)br); - setup_timer(&br->ip6_own_query.timer, br_ip6_multicast_query_expired, - (unsigned long)br); + timer_setup(&br->ip6_other_query.timer, + br_ip6_multicast_querier_expired, 0); + timer_setup(&br->ip6_own_query.timer, + br_ip6_multicast_query_expired, 0); #endif } -- cgit v1.2.3 From 1a3deb11d66ec4509462bbe9f162fc4bb991b061 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 3 Nov 2017 11:51:11 +0530 Subject: net: bridge: Convert timers to use timer_setup() switch to using the new timer_setup() and from_timer() api's. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/bridge/br_stp_timer.c | 48 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index b54c1a331450..e7739de5f0e1 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -31,9 +31,9 @@ static int br_is_designated_for_some_port(const struct net_bridge *br) return 0; } -static void br_hello_timer_expired(unsigned long arg) +static void br_hello_timer_expired(struct timer_list *t) { - struct net_bridge *br = (struct net_bridge *)arg; + struct net_bridge *br = from_timer(br, t, hello_timer); br_debug(br, "hello timer expired\n"); spin_lock(&br->lock); @@ -47,9 +47,9 @@ static void br_hello_timer_expired(unsigned long arg) spin_unlock(&br->lock); } -static void br_message_age_timer_expired(unsigned long arg) +static void br_message_age_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge_port *p = from_timer(p, t, message_age_timer); struct net_bridge *br = p->br; const bridge_id *id = &p->designated_bridge; int was_root; @@ -80,9 +80,9 @@ static void br_message_age_timer_expired(unsigned long arg) spin_unlock(&br->lock); } -static void br_forward_delay_timer_expired(unsigned long arg) +static void br_forward_delay_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge_port *p = from_timer(p, t, forward_delay_timer); struct net_bridge *br = p->br; br_debug(br, "port %u(%s) forward delay timer\n", @@ -104,9 +104,9 @@ static void br_forward_delay_timer_expired(unsigned long arg) spin_unlock(&br->lock); } -static void br_tcn_timer_expired(unsigned long arg) +static void br_tcn_timer_expired(struct timer_list *t) { - struct net_bridge *br = (struct net_bridge *) arg; + struct net_bridge *br = from_timer(br, t, tcn_timer); br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); @@ -118,9 +118,9 @@ static void br_tcn_timer_expired(unsigned long arg) spin_unlock(&br->lock); } -static void br_topology_change_timer_expired(unsigned long arg) +static void br_topology_change_timer_expired(struct timer_list *t) { - struct net_bridge *br = (struct net_bridge *) arg; + struct net_bridge *br = from_timer(br, t, topology_change_timer); br_debug(br, "topo change timer expired\n"); spin_lock(&br->lock); @@ -129,9 +129,9 @@ static void br_topology_change_timer_expired(unsigned long arg) spin_unlock(&br->lock); } -static void br_hold_timer_expired(unsigned long arg) +static void br_hold_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge_port *p = from_timer(p, t, hold_timer); br_debug(p->br, "port %u(%s) hold timer expired\n", (unsigned int) p->port_no, p->dev->name); @@ -144,27 +144,17 @@ static void br_hold_timer_expired(unsigned long arg) void br_stp_timer_init(struct net_bridge *br) { - setup_timer(&br->hello_timer, br_hello_timer_expired, - (unsigned long) br); - - setup_timer(&br->tcn_timer, br_tcn_timer_expired, - (unsigned long) br); - - setup_timer(&br->topology_change_timer, - br_topology_change_timer_expired, - (unsigned long) br); + timer_setup(&br->hello_timer, br_hello_timer_expired, 0); + timer_setup(&br->tcn_timer, br_tcn_timer_expired, 0); + timer_setup(&br->topology_change_timer, + br_topology_change_timer_expired, 0); } void br_stp_port_timer_init(struct net_bridge_port *p) { - setup_timer(&p->message_age_timer, br_message_age_timer_expired, - (unsigned long) p); - - setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired, - (unsigned long) p); - - setup_timer(&p->hold_timer, br_hold_timer_expired, - (unsigned long) p); + timer_setup(&p->message_age_timer, br_message_age_timer_expired, 0); + timer_setup(&p->forward_delay_timer, br_forward_delay_timer_expired, 0); + timer_setup(&p->hold_timer, br_hold_timer_expired, 0); } /* Report ticks left (in USER_HZ) used for API */ -- cgit v1.2.3 From 5a6d80034471d4407052c4bf3758071df5cadf33 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Nov 2017 11:15:28 +0000 Subject: net/ncsi: Make local function ncsi_get_filter() static Fixes the following sparse warnings: net/ncsi/ncsi-manage.c:41:5: warning: symbol 'ncsi_get_filter' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 28c42b22b748..47baf914eec2 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -38,7 +38,7 @@ static inline int ncsi_filter_size(int table) return sizes[table]; } -u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) +static u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) { struct ncsi_channel_filter *ncf; int size; -- cgit v1.2.3 From 4bb1b116b7f37a64c08d28213a2e6f87fcef2d8b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 2 Nov 2017 15:07:01 +0100 Subject: net: sched: move block offload unbind after all chains are flushed Currently, the offload unbind is done before the chains are flushed. That causes driver to unregister block callback before it can get all the callback calls done during flush, leaving the offloaded tps inside the HW. So fix the order to prevent this situation and restore the original behaviour. Reported-by: Alexander Duyck Reported-by: Jakub Kicinski Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a26c690b48ac..3364347bc699 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -343,11 +343,11 @@ void tcf_block_put_ext(struct tcf_block *block, if (!block) return; - tcf_block_offload_unbind(block, q, ei); - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_flush(chain); + tcf_block_offload_unbind(block, q, ei); + INIT_WORK(&block->work, tcf_block_put_final); /* Wait for existing RCU callbacks to cool down, make sure their works * have been queued before this. We can not flush pending works here -- cgit v1.2.3 From fa36882682db0692ecbea20f859180f978923d72 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 2 Nov 2017 15:44:15 +0100 Subject: tipc: eliminate unnecessary probing The neighbor monitor employs a threshold, default set to 32 peer nodes, where it activates the "Overlapping Neighbor Monitoring" algorithm. Below that threshold, monitoring is full-mesh, and no "domain records" are passed between the nodes. Because of this, a node never received a peer's ack that it has received the most recent update of the own domain. Hence, the field 'acked_gen' in struct tipc_monitor_state remains permamently at zero, whereas the own domain generation is incremented for each added or removed peer. This has the effect that the function tipc_mon_get_state() always sets the field 'probing' in struct tipc_monitor_state true, again leading the tipc_link_timeout() of the link in question to always send out a probe, even when link->silent_intv_count is zero. This is functionally harmless, but leads to some unncessary probing, which can easily be eliminated by setting the 'probing' field of the said struct correctly in such cases. At the same time, we explictly invalidate the sent domain records when the algorithm is not activated. This will eliminate any risk that an invalid domain record might be inadverently accepted by the peer. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/monitor.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index b9c32557d73c..8e884ed06d4b 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -530,8 +530,11 @@ void tipc_mon_prep(struct net *net, void *data, int *dlen, u16 gen = mon->dom_gen; u16 len; - if (!tipc_mon_is_active(net, mon)) + /* Send invalid record if not active */ + if (!tipc_mon_is_active(net, mon)) { + dom->len = 0; return; + } /* Send only a dummy record with ack if peer has acked our last sent */ if (likely(state->acked_gen == gen)) { @@ -559,6 +562,12 @@ void tipc_mon_get_state(struct net *net, u32 addr, struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; + if (!tipc_mon_is_active(net, mon)) { + state->probing = false; + state->monitoring = true; + return; + } + /* Used cached state if table has not changed */ if (!state->probing && (state->list_gen == mon->list_gen) && -- cgit v1.2.3 From 9eba9353388da7bfac9694dfe94c15365d49ebd6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Nov 2017 11:53:04 -0700 Subject: tcp: fix a lockdep issue in tcp_fastopen_reset_cipher() icsk_accept_queue.fastopenq.lock is only fully initialized at listen() time. LOCKDEP is not happy if we attempt a spin_lock_bh() on it, because of missing annotation. (Although kernel runs just fine) Lets use net->ipv4.tcp_fastopen_ctx_lock to protect ctx access. Fixes: 1fba70e5b6be ("tcp: socket option to set TCP fast open key") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Christoph Paasch Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e0a4b56644aa..91762be58acc 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -92,20 +92,18 @@ error: kfree(ctx); memcpy(ctx->key, key, len); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - spin_lock_bh(&q->lock); octx = rcu_dereference_protected(q->ctx, - lockdep_is_held(&q->lock)); + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); rcu_assign_pointer(q->ctx, ctx); - spin_unlock_bh(&q->lock); } else { - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); } + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); -- cgit v1.2.3 From 53b3847be5cbdf819cf366ea9160369ff00dcf8e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Nov 2017 20:04:12 +0000 Subject: net: sched: cls_bpf: use bitwise & rather than logical && on gen_flags Currently gen_flags is being operated on by a logical && operator rather than a bitwise & operator. This looks incorrect as these should be bit flag operations. Fix this. Detected by CoverityScan, CID#1460305 ("Logical vs. bitwise operator") Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload) Signed-off-by: Colin Ian King Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f701c8670a2..bc3edde1b9d7 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -174,7 +174,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, } } - if (addorrep && skip_sw && !(prog->gen_flags && TCA_CLS_FLAGS_IN_HW)) + if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; -- cgit v1.2.3 From f67971e683e81d7ba4739728511ae6e52a1b6321 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Nov 2017 18:10:03 -0700 Subject: tcp: tcp_fragment() should not assume rtx skbs While stress testing MTU probing, we had crashes in list_del() that we root-caused to the fact that tcp_fragment() is unconditionally inserting the freshly allocated skb into tsorted_sent_queue list. But this list is supposed to contain skbs that were sent. This was mostly harmless until MTU probing was enabled. Fortunately we can use the tcp_queue enum added later (but in same linux version) for rtx-rb-tree to fix the bug. Fixes: e2080072ed2d ("tcp: new list for sent but unacked skbs for RACK recovery") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Cc: Alexei Starovoitov Cc: Priyaranjan Jha Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 06a0c89ffe40..822962ece284 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1395,7 +1395,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); - list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } -- cgit v1.2.3 From 8f918d3ff4a1283a1693afe2b4c8e1844674ca15 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Nov 2017 17:32:08 -0700 Subject: net_sched: check NULL in tcf_block_put() Callers of tcf_block_put() could pass NULL so we can't use block->q before checking if block is NULL or not. tcf_block_put_ext() callers are fine, it is always non-NULL. Fixes: 8c4083b30e56 ("net: sched: add block bind/unbind notif. and extended block_get/put") Reported-by: Dave Taht Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3364347bc699..8d1885abee83 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -340,9 +340,6 @@ void tcf_block_put_ext(struct tcf_block *block, { struct tcf_chain *chain, *tmp; - if (!block) - return; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_flush(chain); @@ -362,6 +359,8 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; + if (!block) + return; tcf_block_put_ext(block, NULL, block->q, &ei); } -- cgit v1.2.3 From c7eb7d7230509ec862d4144f7a831f995bc5d028 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:24 +0100 Subject: net: sched: introduce chain_head_change callback Add a callback that is to be called whenever head of the chain changes. Also provide a callback for the default case when the caller gets a block using non-extended getter. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 54 +++++++++++++++++++++++++++++-------------------- net/sched/sch_ingress.c | 36 ++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8d1885abee83..206e19f4fc01 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -195,12 +195,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, return chain; } +static void tcf_chain_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) +{ + if (chain->chain_head_change) + chain->chain_head_change(tp_head, + chain->chain_head_change_priv); +} + static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (chain->p_filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, NULL); + tcf_chain_head_change(chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_chain_put(chain); @@ -242,13 +249,6 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); -static void -tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, - struct tcf_proto __rcu **p_filter_chain) -{ - chain->p_filter_chain = p_filter_chain; -} - static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, enum tc_block_command command) @@ -276,8 +276,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); } -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); @@ -295,7 +294,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + WARN_ON(!ei->chain_head_change); + chain->chain_head_change = ei->chain_head_change; + chain->chain_head_change_priv = ei->chain_head_change_priv; block->net = qdisc_net(q); block->q = q; tcf_block_offload_bind(block, q, ei); @@ -308,12 +309,23 @@ err_chain_create: } EXPORT_SYMBOL(tcf_block_get_ext); +static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) +{ + struct tcf_proto __rcu **p_filter_chain = priv; + + rcu_assign_pointer(*p_filter_chain, tp_head); +} + int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { - struct tcf_block_ext_info ei = {0, }; + struct tcf_block_ext_info ei = { + .chain_head_change = tcf_chain_head_change_dflt, + .chain_head_change_priv = p_filter_chain, + }; - return tcf_block_get_ext(p_block, p_filter_chain, q, &ei); + WARN_ON(!p_filter_chain); + return tcf_block_get_ext(p_block, q, &ei); } EXPORT_SYMBOL(tcf_block_get); @@ -334,8 +346,7 @@ static void tcf_block_put_final(struct work_struct *work) * actions should be all removed after flushing. However, filters are now * destroyed in tc filter workqueue with RTNL lock, they can not race here. */ -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; @@ -361,7 +372,7 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; - tcf_block_put_ext(block, NULL, block->q, &ei); + tcf_block_put_ext(block, block->q, &ei); } EXPORT_SYMBOL(tcf_block_put); @@ -537,9 +548,8 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { - if (chain->p_filter_chain && - *chain_info->pprev == chain->filter_chain) - rcu_assign_pointer(*chain->p_filter_chain, tp); + if (*chain_info->pprev == chain->filter_chain) + tcf_chain_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -551,8 +561,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, { struct tcf_proto *next = rtnl_dereference(chain_info->next); - if (chain->p_filter_chain && tp == chain->filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, next); + if (tp == chain->filter_chain) + tcf_chain_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index b599db26d34b..811845815b8c 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -54,6 +54,13 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) return q->block; } +static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) +{ + struct tcf_proto __rcu **p_filter_chain = priv; + + rcu_assign_pointer(*p_filter_chain, tp_head); +} + static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { struct ingress_sched_data *q = qdisc_priv(sch); @@ -61,9 +68,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) int err; q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->block_info.chain_head_change = clsact_chain_head_change; + q->block_info.chain_head_change_priv = &dev->ingress_cl_list; - err = tcf_block_get_ext(&q->block, &dev->ingress_cl_list, - sch, &q->block_info); + err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) return err; @@ -76,10 +84,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - tcf_block_put_ext(q->block, &dev->ingress_cl_list, - sch, &q->block_info); + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } @@ -162,16 +168,18 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) int err; q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->ingress_block_info.chain_head_change = clsact_chain_head_change; + q->ingress_block_info.chain_head_change_priv = &dev->ingress_cl_list; - err = tcf_block_get_ext(&q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + q->egress_block_info.chain_head_change = clsact_chain_head_change; + q->egress_block_info.chain_head_change_priv = &dev->egress_cl_list; - err = tcf_block_get_ext(&q->egress_block, &dev->egress_cl_list, - sch, &q->egress_block_info); + err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) goto err_egress_block_get; @@ -183,20 +191,16 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) return 0; err_egress_block_get: - tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); return err; } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - tcf_block_put_ext(q->egress_block, &dev->egress_cl_list, - sch, &q->egress_block_info); - tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); net_dec_ingress_queue(); net_dec_egress_queue(); -- cgit v1.2.3 From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:25 +0100 Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath In sch_handle_egress and sch_handle_ingress tp->q is used only in order to update stats. So stats and filter list are the only things that are needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc struct to hold those items. Also, introduce a helper to swap the mini_Qdisc structures in case filter list head changes. This removes need for tp->q usage without added overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 21 +++++++++++---------- net/sched/sch_generic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_ingress.c | 19 ++++++++++++++----- 3 files changed, 71 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 24ac9083bc13..1423cf4d695c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3274,22 +3274,22 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; - if (!cl) + if (!miniq) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; kfree_skb(skb); return NULL; @@ -4189,7 +4189,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so @@ -4197,8 +4197,9 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * that are not configured with an ingress qdisc will bail * out here. */ - if (!cl) + if (!miniq) return skb; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4206,15 +4207,15 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_at_ingress = 1; - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); kfree_skb(skb); return NULL; case TC_ACT_STOLEN: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aa74aa42b5d7..3839cbbdc32b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1024,3 +1024,49 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, } } EXPORT_SYMBOL(psched_ratecfg_precompute); + +static void mini_qdisc_rcu_func(struct rcu_head *head) +{ +} + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head) +{ + struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq); + struct mini_Qdisc *miniq; + + if (!tp_head) { + RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + return; + } + + miniq = !miniq_old || miniq_old == &miniqp->miniq2 ? + &miniqp->miniq1 : &miniqp->miniq2; + + /* We need to make sure that readers won't see the miniq + * we are about to modify. So wait until previous call_rcu_bh callback + * is done. + */ + rcu_barrier_bh(); + miniq->filter_list = tp_head; + rcu_assign_pointer(*miniqp->p_miniq, miniq); + + if (miniq_old) + /* This is counterpart of the rcu barrier above. We need to + * block potential new user of miniq_old until all readers + * are not seeing it. + */ + call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); +} +EXPORT_SYMBOL(mini_qdisc_pair_swap); + +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq) +{ + miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; + miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; + miniqp->p_miniq = p_miniq; +} +EXPORT_SYMBOL(mini_qdisc_pair_init); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 811845815b8c..5ecc38f35d47 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -21,6 +21,7 @@ struct ingress_sched_data { struct tcf_block *block; struct tcf_block_ext_info block_info; + struct mini_Qdisc_pair miniqp; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -56,9 +57,9 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) { - struct tcf_proto __rcu **p_filter_chain = priv; + struct mini_Qdisc_pair *miniqp = priv; - rcu_assign_pointer(*p_filter_chain, tp_head); + mini_qdisc_pair_swap(miniqp, tp_head); } static int ingress_init(struct Qdisc *sch, struct nlattr *opt) @@ -67,9 +68,11 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; - q->block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->block_info.chain_head_change_priv = &q->miniqp; err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) @@ -128,6 +131,8 @@ struct clsact_sched_data { struct tcf_block *egress_block; struct tcf_block_ext_info ingress_block_info; struct tcf_block_ext_info egress_block_info; + struct mini_Qdisc_pair miniqp_ingress; + struct mini_Qdisc_pair miniqp_egress; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -167,17 +172,21 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; - q->ingress_block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; + mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; - q->egress_block_info.chain_head_change_priv = &dev->egress_cl_list; + q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) -- cgit v1.2.3 From 9d917c207d6b212eb5fb4d8ca31826e5b75b4d28 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 2 Nov 2017 15:49:08 +0300 Subject: add support of IFF_XMIT_DST_RELEASE bit in vlan Some time ago Eric Dumazet suggested a "hack the IFF_XMIT_DST_RELEASE flag on the vlan netdev". But the last comment was "does not support properly bonding/team.(If the real_dev->privflags IFF_XMIT_DST_RELEASE bit changes, we want to update all the vlans at the same time )" I've extended that patch to support changes of IFF_XMIT_DST_RELEASE in bonding/team. Both bonding and team call netdev_change_features() after recalculation of features including priv_flags IFF_XMIT_DST_RELEASE bit. So the only thing needed to support is to recheck this bit in vlan_transfer_features(). Suggested-by: Eric Dumazet Signed-off-by: Vadim Fedorenko Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan_netlink.c | 1 + 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 71c3e045505b..a0103500cc6d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -328,6 +328,9 @@ static void vlan_transfer_features(struct net_device *dev, vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif + vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); + netdev_update_features(vlandev); } diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 6e7c5a6a7930..6689c0b272a7 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -143,6 +143,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); vlan->real_dev = real_dev; + dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id); -- cgit v1.2.3 From 991a26af2e65e0422b2a899e373c0146f2436cae Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 2 Nov 2017 17:07:05 +0300 Subject: tcp_nv: use do_div() instead of expensive div64_u64() Average RTT is 32-bit thus full 64-bit division is redundant. Signed-off-by: Konstantin Khlebnikov Suggested-by: Stephen Hemminger Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_nv.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0529e29e2941..0b5a05bd82e3 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -242,7 +242,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) struct tcp_sock *tp = tcp_sk(sk); struct tcpnv *ca = inet_csk_ca(sk); unsigned long now = jiffies; - s64 rate64 = 0; + u64 rate64; u32 rate, max_win, cwnd_by_slope; u32 avg_rtt; u32 bytes_acked = 0; @@ -284,8 +284,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* rate in 100's bits per second */ - rate64 = ((u64)sample->in_flight) * 8000000; - rate = (u32)div64_u64(rate64, (u64)(avg_rtt ?: 1) * 100); + rate64 = ((u64)sample->in_flight) * 80000; + do_div(rate64, avg_rtt ?: 1); + rate = (u32)rate64; /* Remember the maximum rate seen during this RTT * Note: It may be more than one RTT. This function should be -- cgit v1.2.3 From 0f04d057515275099c6e4f767b95a278be4681bf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Nov 2017 08:09:45 +0000 Subject: net: sched: cls_u32: use bitwise & rather than logical && on n->flags Currently n->flags is being operated on by a logical && operator rather than a bitwise & operator. This looks incorrect as these should be bit flag operations. Fix this. Detected by CoverityScan, CID#1460398 ("Logical vs. bitwise operator") Fixes: 245dc5121a9b ("net: sched: cls_u32: call block callbacks for offload") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 86145867b424..2737b71854c9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -572,7 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, n->flags |= TCA_CLS_FLAGS_IN_HW; } - if (skip_sw && !(n->flags && TCA_CLS_FLAGS_IN_HW)) + if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; -- cgit v1.2.3 From df7e8e2e3e59fe29c24f09d7c3b68e732d661af3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:27:14 -0700 Subject: pktgen: do not abuse IN6_ADDR_HSIZE pktgen accidentally used IN6_ADDR_HSIZE, instead of using the size of an IPv6 address. Since IN6_ADDR_HSIZE recently was increased from 16 to 256, this old bug is hitting us. Fixes: 3f27fb23219e ("ipv6: addrconf: add per netns perturbation in inet6_addr_hash()") Signed-off-by: Eric Dumazet Reported-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6e1e10ff433a..e3fa53a07d34 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2165,7 +2165,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) + pkt_dev->pkt_overhead; } - for (i = 0; i < IN6_ADDR_HSIZE; i++) + for (i = 0; i < sizeof(struct in6_addr); i++) if (pkt_dev->cur_in6_saddr.s6_addr[i]) { set = 1; break; -- cgit v1.2.3 From 27c565ae9d554fa1c00c799754cff43476c8d3b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:53:27 -0700 Subject: ipv6: remove IN6_ADDR_HSIZE from addrconf.h IN6_ADDR_HSIZE is private to addrconf.c, move it here to avoid confusion. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 69b8cdb43aa2..66d8c3d912fd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -157,6 +157,8 @@ static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) /* * Configured unicast address hash table */ -- cgit v1.2.3 From 7cbebc8a142238da3c2966f87b81ace491c8f089 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:36 -0200 Subject: net: export peernet2id_alloc It will be used by openvswitch. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6cfdc7c84c48..b797832565d3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -234,6 +234,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } +EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(struct net *net, struct net *peer) -- cgit v1.2.3 From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:37 -0200 Subject: openvswitch: reliable interface indentification in port dumps This patch allows reliable identification of netdevice interfaces connected to openvswitch bridges. In particular, user space queries the netdev interfaces belonging to the ports for statistics, up/down state, etc. Datapath dump needs to provide enough information for the user space to be able to do that. Currently, only interface names are returned. This is not sufficient, as openvswitch allows its ports to be in different name spaces and the interface name is valid only in its name space. What is needed and generally used in other netlink APIs, is the pair ifindex+netnsid. The solution is addition of the ifindex+netnsid pair (or only ifindex if in the same name space) to vport get/dump operation. On request side, ideally the ifindex+netnsid pair could be used to get/set/del the corresponding vport. This is not implemented by this patch and can be added later if needed. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 47 ++++++++++++++++++++++++++++++++------------- net/openvswitch/datapath.h | 4 ++-- net/openvswitch/dp_notify.c | 4 ++-- 3 files changed, 38 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c3aec6227c91..4d38ac044cee 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1848,7 +1848,8 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, - u32 portid, u32 seq, u32 flags, u8 cmd) + struct net *net, u32 portid, u32 seq, + u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; @@ -1864,9 +1865,17 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - ovs_vport_name(vport))) + ovs_vport_name(vport)) || + nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; + if (!net_eq(net, dev_net(vport->dev))) { + int id = peernet2id_alloc(net, dev_net(vport->dev)); + + if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) + goto nla_put_failure; + } + ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, @@ -1896,8 +1905,8 @@ static struct sk_buff *ovs_vport_cmd_alloc_info(void) } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, - u32 seq, u8 cmd) +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; @@ -1906,7 +1915,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); BUG_ON(retval < 0); return skb; @@ -1920,6 +1929,8 @@ static struct vport *lookup_vport(struct net *net, struct datapath *dp; struct vport *vport; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) @@ -1944,6 +1955,7 @@ static struct vport *lookup_vport(struct net *net, return vport; } else return ERR_PTR(-EINVAL); + } /* Called with ovs_mutex */ @@ -1983,6 +1995,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; @@ -2032,8 +2046,9 @@ restart: goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) update_headroom(dp); @@ -2090,8 +2105,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); ovs_unlock(); @@ -2128,8 +2144,9 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_DEL); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_DEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ @@ -2169,8 +2186,9 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); rcu_read_unlock(); @@ -2202,6 +2220,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, + sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -2228,6 +2247,8 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, + [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, }; static const struct genl_ops dp_vport_genl_ops[] = { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 480600649d0b..4a104ef9e12c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -200,8 +200,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); -struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, - u8 cmd); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *, struct sw_flow_key *); diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 653d073bae45..f3ee2f2825c0 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -30,8 +30,8 @@ static void dp_detach_port_notify(struct vport *vport) struct datapath *dp; dp = vport->dp; - notify = ovs_vport_cmd_build_info(vport, 0, 0, - OVS_VPORT_CMD_DEL); + notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), + 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, -- cgit v1.2.3 From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:38 -0200 Subject: rtnetlink: use netnsid to query interface Currently, when an application gets netnsid from the kernel (for example as the result of RTM_GETLINK call on one end of the veth pair), it's not much useful. There's no reliable way to get to the netns fd from the netnsid, nor does any kernel API accept netnsid. Extend the RTM_GETLINK call to also accept netnsid. It will operate on the netns with the given netnsid in such case. Of course, the calling process needs to have enough capabilities in the target name space; for now, require CAP_NET_ADMIN. This can be relaxed in the future. To signal to the calling process that the kernel understood the new IFLA_IF_NETNSID attribute in the query, it will include it in the response. This is needed to detect older kernels, as they will just ignore IFLA_IF_NETNSID and query in the current name space. This patch implemetns IFLA_IF_NETNSID only for get and dump. For set operations, this can be extended later. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 103 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index de24d394c69e..8a8c51937edf 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -921,7 +921,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ - + + nla_total_size(4) /* IFLA_IF_NETNSID */ + + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1370,13 +1371,14 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, } static int rtnl_fill_link_netnsid(struct sk_buff *skb, - const struct net_device *dev) + const struct net_device *dev, + struct net *src_net) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); + int id = peernet2id_alloc(src_net, link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; @@ -1427,10 +1429,11 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } -static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, +static int rtnl_fill_ifinfo(struct sk_buff *skb, + struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid) + u32 event, int *new_nsid, int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1448,6 +1451,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid)) + goto nla_put_failure; + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1513,7 +1519,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev)) + if (rtnl_fill_link_netnsid(skb, dev, src_net)) goto nla_put_failure; if (new_nsid && @@ -1571,6 +1577,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, + [IFLA_IF_NETNSID] = { .type = NLA_S32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1674,9 +1681,28 @@ static bool link_dump_filtered(struct net_device *dev, return false; } +static struct net *get_target_net(struct sk_buff *skb, int netnsid) +{ + struct net *net; + + net = get_net_ns_by_id(sock_net(skb->sk), netnsid); + if (!net) + return ERR_PTR(-EINVAL); + + /* For now, the caller is required to have CAP_NET_ADMIN in + * the user namespace owning the target net ns. + */ + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EACCES); + } + return net; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; int h, s_h; int idx = 0, s_idx; struct net_device *dev; @@ -1686,6 +1712,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) const struct rtnl_link_ops *kind_ops = NULL; unsigned int flags = NLM_F_MULTI; int master_idx = 0; + int netnsid = -1; int err; int hdrlen; @@ -1704,6 +1731,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(skb, netnsid); + if (IS_ERR(tgt_net)) { + tgt_net = net; + netnsid = -1; + } + } + if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -1719,17 +1755,19 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) goto cont; if (idx < s_idx) goto cont; - err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + err = rtnl_fill_ifinfo(skb, dev, net, + RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL); + ext_filter_mask, 0, NULL, + netnsid); if (err < 0) { if (likely(skb->len)) @@ -1748,6 +1786,8 @@ out_err: cb->args[0] = h; cb->seq = net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + if (netnsid >= 0) + put_net(tgt_net); return err; } @@ -2360,6 +2400,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2454,6 +2497,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); @@ -2585,6 +2631,9 @@ replay: if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2818,11 +2867,13 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; + int netnsid = -1; int err; u32 ext_filter_mask = 0; @@ -2830,35 +2881,50 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(skb, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(tgt_net, ifname); else - return -EINVAL; + goto out; + err = -ENODEV; if (dev == NULL) - return -ENODEV; + goto out; + err = -ENOBUFS; nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) - return -ENOBUFS; + goto out; - err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); + err = rtnl_fill_ifinfo(nskb, dev, net, + RTM_NEWLINK, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, 0, 0, ext_filter_mask, + 0, NULL, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); +out: + if (netnsid >= 0) + put_net(tgt_net); return err; } @@ -2948,8 +3014,9 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, - new_nsid); + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), + type, 0, 0, change, 0, 0, event, + new_nsid, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); -- cgit v1.2.3 From d0f36847016276920d860d5c089934ff3fea7e30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Nov 2017 06:09:17 -0700 Subject: tcp: tcp_mtu_probing() cleanup Reduce one indentation level to make code more readable. tcp_sync_mss() can be factorized. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 035a1ef1f2d8..16df6dd44b98 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -107,26 +107,23 @@ static int tcp_orphan_retries(struct sock *sk, bool alive) static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { - struct net *net = sock_net(sk); + const struct net *net = sock_net(sk); + int mss; /* Black hole detection */ - if (net->ipv4.sysctl_tcp_mtu_probing) { - if (!icsk->icsk_mtup.enabled) { - icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } else { - struct net *net = sock_net(sk); - struct tcp_sock *tp = tcp_sk(sk); - int mss; - - mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, 68 - tp->tcp_header_len); - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } + if (!net->ipv4.sysctl_tcp_mtu_probing) + return; + + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + } else { + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(net->ipv4.sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } -- cgit v1.2.3 From 35e00da36cf42d54270ce25110ce304d12b18586 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Nov 2017 06:18:59 -0700 Subject: tcp: do not clear again skb->csum in tcp_init_nondata_skb() tcp_init_nondata_skb() is fed with freshly allocated skbs. They already have a cleared csum field, no need to clear it again. This is based on Neal review on commit 3b11775033dc ("tcp: do not mangle skb->cb[] in tcp_make_synack()"), noticing I did not clear skb->csum. Signed-off-by: Eric Dumazet Reported-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b98b2f7f07e7..a9d917e4dad5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -381,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3 From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- net/core/dev.c | 34 +++++++++++++++++----------------- net/core/rtnetlink.c | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1423cf4d695c..10cde58d3275 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,7 +4545,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -7090,26 +7090,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. */ - WARN_ON(xdp_op(dev, &xdp) < 0); + WARN_ON(bpf_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7120,7 +7120,7 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, xdp.flags = flags; xdp.prog = prog; - return xdp_op(dev, &xdp); + return bpf_op(dev, &xdp); } /** @@ -7137,24 +7137,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - xdp_op_t xdp_op, xdp_chk; + bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); - xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + bpf_op = bpf_chk = ops->ndo_bpf; + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) - xdp_op = generic_xdp_install; - if (xdp_op == xdp_chk) - xdp_chk = generic_xdp_install; + if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) + bpf_op = generic_xdp_install; + if (bpf_op == bpf_chk) + bpf_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op, NULL)) + __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7162,7 +7162,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, flags, prog); + err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8a8c51937edf..dc5ad84ac096 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1270,10 +1270,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_xdp) + if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); + return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 10cde58d3275..30b5fe32c525 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7157,7 +7157,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (bpf_op == ops->ndo_bpf) + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + dev); + else + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } -- cgit v1.2.3 From 6c8dfe21c435cf2953e3cee43e12180cbc4f0820 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:21 -0700 Subject: cls_bpf: allow attaching programs loaded for specific device If TC program is loaded with skip_sw flag, we should allow the device-specific programs to be accepted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bc3edde1b9d7..dc9bd9a0070b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -374,7 +374,7 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) } static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, - const struct tcf_proto *tp) + u32 gen_flags, const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -382,7 +382,11 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, + qdisc_dev(tp->q)); + else + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -440,7 +444,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : - cls_bpf_prog_from_efd(tb, prog, tp); + cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) return ret; -- cgit v1.2.3 From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- net/core/filter.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a0112168d6f9..1afa17935954 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3777,25 +3777,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } -static bool -tc_cls_act_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct sk_buff, len): - return true; - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3830,21 +3811,6 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -static bool xdp_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4516,10 +4482,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; -const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { - .is_valid_access = tc_cls_act_is_valid_access_analyzer, -}; - const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4530,10 +4492,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; -const struct bpf_verifier_ops xdp_analyzer_ops = { - .is_valid_access = xdp_is_valid_access_analyzer, -}; - const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 49463b7f2da1a115404b02c5533bc2c2125833a3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:21 -0400 Subject: net: dsa: make tree index unsigned Similarly to a DSA switch and port, rename the tree index from "tree" to "index" and make it an unsigned int because it isn't supposed to be less than 0. u32 is an OF specific data used to retrieve the value and has no need to be propagated up to the tree index. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 14 +++++++------- net/dsa/slave.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 797d1156b4e6..8b68dc2f5707 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -27,12 +27,12 @@ static DEFINE_MUTEX(dsa2_mutex); static const struct devlink_ops dsa_devlink_ops = { }; -static struct dsa_switch_tree *dsa_get_dst(u32 tree) +static struct dsa_switch_tree *dsa_get_dst(unsigned int index) { struct dsa_switch_tree *dst; list_for_each_entry(dst, &dsa_switch_trees, list) - if (dst->tree == tree) { + if (dst->index == index) { kref_get(&dst->refcount); return dst; } @@ -53,14 +53,14 @@ static void dsa_put_dst(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_free_dst); } -static struct dsa_switch_tree *dsa_add_dst(u32 tree) +static struct dsa_switch_tree *dsa_add_dst(unsigned int index) { struct dsa_switch_tree *dst; dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) return NULL; - dst->tree = tree; + dst->index = index; INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_switch_trees, &dst->list); kref_init(&dst->refcount); @@ -454,7 +454,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dst->cpu_dp = NULL; - pr_info("DSA: tree %d unapplied\n", dst->tree); + pr_info("DSA: tree %d unapplied\n", dst->index); dst->applied = false; } @@ -504,7 +504,7 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) } - pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); + pr_info("DSA: switch %d %d parsed\n", dst->index, ds->index); return 0; } @@ -549,7 +549,7 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) } } - pr_info("DSA: tree %d parsed\n", dst->tree); + pr_info("DSA: tree %d parsed\n", dst->index); return 0; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9b75d0ac4092..814ced75a0cc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -55,7 +55,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", - ds->dst->tree, ds->index); + ds->dst->index, ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } -- cgit v1.2.3 From 8e5bf9759a06be2251fa96cfd8b412f1808c62f9 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:22 -0400 Subject: net: dsa: simplify tree reference counting DSA trees have a refcount used to automatically free the dsa_switch_tree structure once there is no switch devices inside of it. The refcount is incremented when a switch is added to the tree, and decremented when it is removed from it. But because of kref_init, the refcount is also incremented at initialization, and when looking up the tree from the list for symmetry. Thus the current code stores the number of switches plus one, and makes the switch registration more complex. To simplify the switch registration function, we reset the refcount to zero after initialization and don't increment it when looking up a tree. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 8b68dc2f5707..d3f1a7607463 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -32,10 +32,9 @@ static struct dsa_switch_tree *dsa_get_dst(unsigned int index) struct dsa_switch_tree *dst; list_for_each_entry(dst, &dsa_switch_trees, list) - if (dst->index == index) { - kref_get(&dst->refcount); + if (dst->index == index) return dst; - } + return NULL; } @@ -48,11 +47,6 @@ static void dsa_free_dst(struct kref *ref) kfree(dst); } -static void dsa_put_dst(struct dsa_switch_tree *dst) -{ - kref_put(&dst->refcount, dsa_free_dst); -} - static struct dsa_switch_tree *dsa_add_dst(unsigned int index) { struct dsa_switch_tree *dst; @@ -63,7 +57,10 @@ static struct dsa_switch_tree *dsa_add_dst(unsigned int index) dst->index = index; INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_switch_trees, &dst->list); + + /* Initialize the reference counter to the number of switches, not 1 */ kref_init(&dst->refcount); + refcount_set(&dst->refcount.refcount, 0); return dst; } @@ -739,10 +736,8 @@ static int _dsa_register_switch(struct dsa_switch *ds) return -ENOMEM; } - if (dst->ds[index]) { - err = -EBUSY; - goto out; - } + if (dst->ds[index]) + return -EBUSY; ds->dst = dst; ds->index = index; @@ -758,11 +753,9 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err < 0) goto out_del_dst; - if (err == 1) { - /* Not all switches registered yet */ - err = 0; - goto out; - } + /* Not all switches registered yet */ + if (err == 1) + return 0; if (dst->applied) { pr_info("DSA: Disjoint trees?\n"); @@ -779,13 +772,10 @@ static int _dsa_register_switch(struct dsa_switch *ds) goto out_del_dst; } - dsa_put_dst(dst); return 0; out_del_dst: dsa_dst_del_ds(dst, ds, ds->index); -out: - dsa_put_dst(dst); return err; } -- cgit v1.2.3 From 65254108b4655dc55e8d8f62ee895960085e73f4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:23 -0400 Subject: net: dsa: get and put tree reference counting Provide convenient dsa_tree_get and dsa_tree_put functions scoping a DSA tree used to increment and decrement its reference counter, instead of poking directly its kref structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d3f1a7607463..609d92684505 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -38,15 +38,6 @@ static struct dsa_switch_tree *dsa_get_dst(unsigned int index) return NULL; } -static void dsa_free_dst(struct kref *ref) -{ - struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree, - refcount); - - list_del(&dst->list); - kfree(dst); -} - static struct dsa_switch_tree *dsa_add_dst(unsigned int index) { struct dsa_switch_tree *dst; @@ -65,10 +56,35 @@ static struct dsa_switch_tree *dsa_add_dst(unsigned int index) return dst; } +static void dsa_tree_free(struct dsa_switch_tree *dst) +{ + list_del(&dst->list); + kfree(dst); +} + +static void dsa_tree_get(struct dsa_switch_tree *dst) +{ + kref_get(&dst->refcount); +} + +static void dsa_tree_release(struct kref *ref) +{ + struct dsa_switch_tree *dst; + + dst = container_of(ref, struct dsa_switch_tree, refcount); + + dsa_tree_free(dst); +} + +static void dsa_tree_put(struct dsa_switch_tree *dst) +{ + kref_put(&dst->refcount, dsa_tree_release); +} + static void dsa_dst_add_ds(struct dsa_switch_tree *dst, struct dsa_switch *ds, u32 index) { - kref_get(&dst->refcount); + dsa_tree_get(dst); dst->ds[index] = ds; } @@ -76,7 +92,7 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst, struct dsa_switch *ds, u32 index) { dst->ds[index] = NULL; - kref_put(&dst->refcount, dsa_free_dst); + dsa_tree_put(dst); } /* For platform data configurations, we need to have a valid name argument to -- cgit v1.2.3 From 1ca28ec9abff927178b1ea9d6431e4c320145b10 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:24 -0400 Subject: net: dsa: provide a find or new tree helper Rename dsa_get_dst to dsa_tree_find since it doesn't increment the reference counter, rename dsa_add_dst to dsa_tree_alloc for symmetry with dsa_tree_free, and provide a convenient dsa_tree_touch function to find or allocate a new tree. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 609d92684505..bda222cfc02c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,33 +21,35 @@ #include "dsa_priv.h" -static LIST_HEAD(dsa_switch_trees); +static LIST_HEAD(dsa_tree_list); static DEFINE_MUTEX(dsa2_mutex); static const struct devlink_ops dsa_devlink_ops = { }; -static struct dsa_switch_tree *dsa_get_dst(unsigned int index) +static struct dsa_switch_tree *dsa_tree_find(int index) { struct dsa_switch_tree *dst; - list_for_each_entry(dst, &dsa_switch_trees, list) + list_for_each_entry(dst, &dsa_tree_list, list) if (dst->index == index) return dst; return NULL; } -static struct dsa_switch_tree *dsa_add_dst(unsigned int index) +static struct dsa_switch_tree *dsa_tree_alloc(int index) { struct dsa_switch_tree *dst; dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) return NULL; + dst->index = index; + INIT_LIST_HEAD(&dst->list); - list_add_tail(&dsa_switch_trees, &dst->list); + list_add_tail(&dsa_tree_list, &dst->list); /* Initialize the reference counter to the number of switches, not 1 */ kref_init(&dst->refcount); @@ -62,6 +64,17 @@ static void dsa_tree_free(struct dsa_switch_tree *dst) kfree(dst); } +static struct dsa_switch_tree *dsa_tree_touch(int index) +{ + struct dsa_switch_tree *dst; + + dst = dsa_tree_find(index); + if (!dst) + dst = dsa_tree_alloc(index); + + return dst; +} + static void dsa_tree_get(struct dsa_switch_tree *dst) { kref_get(&dst->refcount); @@ -745,12 +758,9 @@ static int _dsa_register_switch(struct dsa_switch *ds) return err; } - dst = dsa_get_dst(tree); - if (!dst) { - dst = dsa_add_dst(tree); - if (!dst) - return -ENOMEM; - } + dst = dsa_tree_touch(tree); + if (!dst) + return -ENOMEM; if (dst->ds[index]) return -EBUSY; -- cgit v1.2.3 From 6da2a940ac6a0680e50b3aaf945e409cea03c346 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:25 -0400 Subject: net: dsa: rework switch addition and removal This patch removes the unnecessary index argument from the dsa_dst_add_ds and dsa_dst_del_ds functions and renames them to dsa_tree_add_switch and dsa_tree_remove_switch respectively. In addition to a more explicit scope, we now check the presence of an existing switch with the same index directly within dsa_tree_add_switch. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index bda222cfc02c..5b6a3dad8015 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -94,20 +94,6 @@ static void dsa_tree_put(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_tree_release); } -static void dsa_dst_add_ds(struct dsa_switch_tree *dst, - struct dsa_switch *ds, u32 index) -{ - dsa_tree_get(dst); - dst->ds[index] = ds; -} - -static void dsa_dst_del_ds(struct dsa_switch_tree *dst, - struct dsa_switch *ds, u32 index) -{ - dst->ds[index] = NULL; - dsa_tree_put(dst); -} - /* For platform data configurations, we need to have a valid name argument to * differentiate a disabled port from an enabled one */ @@ -484,6 +470,27 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dst->applied = false; } +static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, + unsigned int index) +{ + dst->ds[index] = NULL; + dsa_tree_put(dst); +} + +static int dsa_tree_add_switch(struct dsa_switch_tree *dst, + struct dsa_switch *ds) +{ + unsigned int index = ds->index; + + if (dst->ds[index]) + return -EBUSY; + + dsa_tree_get(dst); + dst->ds[index] = ds; + + return 0; +} + static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) @@ -762,9 +769,6 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (!dst) return -ENOMEM; - if (dst->ds[index]) - return -EBUSY; - ds->dst = dst; ds->index = index; ds->cd = pdata; @@ -773,7 +777,9 @@ static int _dsa_register_switch(struct dsa_switch *ds) for (i = 0; i < DSA_MAX_SWITCHES; ++i) ds->rtable[i] = DSA_RTABLE_NONE; - dsa_dst_add_ds(dst, ds, index); + err = dsa_tree_add_switch(dst, ds); + if (err) + return err; err = dsa_dst_complete(dst); if (err < 0) @@ -801,7 +807,7 @@ static int _dsa_register_switch(struct dsa_switch *ds) return 0; out_del_dst: - dsa_dst_del_ds(dst, ds, ds->index); + dsa_tree_remove_switch(dst, index); return err; } @@ -843,10 +849,11 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void _dsa_unregister_switch(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; + unsigned int index = ds->index; dsa_dst_unapply(dst); - dsa_dst_del_ds(dst, ds, ds->index); + dsa_tree_remove_switch(dst, index); } void dsa_unregister_switch(struct dsa_switch *ds) -- cgit v1.2.3 From 0eefe2c1730020c6207bc9695fd466e558301dbb Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:26 -0400 Subject: net: dsa: get tree before parsing ports We will need a reference to the dsa_switch_tree when parsing a CPU port, so fetch it right after parsing the member and before parsing ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5b6a3dad8015..5918fbddb0ab 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -751,18 +751,10 @@ static int _dsa_register_switch(struct dsa_switch *ds) err = dsa_parse_member_dn(np, &tree, &index); if (err) return err; - - err = dsa_parse_ports_of(np, ds); - if (err) - return err; } else { err = dsa_parse_member(pdata, &tree, &index); if (err) return err; - - err = dsa_parse_ports(pdata, ds); - if (err) - return err; } dst = dsa_tree_touch(tree); @@ -773,6 +765,16 @@ static int _dsa_register_switch(struct dsa_switch *ds) ds->index = index; ds->cd = pdata; + if (np) { + err = dsa_parse_ports_of(np, ds); + if (err) + return err; + } else { + err = dsa_parse_ports(pdata, ds); + if (err) + return err; + } + /* Initialize the routing table */ for (i = 0; i < DSA_MAX_SWITCHES; ++i) ds->rtable[i] = DSA_RTABLE_NONE; -- cgit v1.2.3 From 975e6e32215e6cbc09b65d762865b1a46e8e9103 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:27 -0400 Subject: net: dsa: rework switch parsing When parsing a switch, we have to identify to which tree it belongs and parse its ports. Provide two functions to separate the OF and platform data specific paths. Also use the of_property_read_variable_u32_array function to parse the OF member array instead of calling of_property_read_u32_index twice. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 117 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5918fbddb0ab..dfcb6247f2f2 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -617,7 +617,8 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) return 0; } -static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) +static int dsa_switch_parse_ports_of(struct dsa_switch *ds, + struct device_node *dn) { struct device_node *ports, *port; struct dsa_port *dp; @@ -648,6 +649,39 @@ static int dsa_parse_ports_of(struct device_node *dn, struct dsa_switch *ds) return 0; } +static int dsa_switch_parse_member_of(struct dsa_switch *ds, + struct device_node *dn) +{ + u32 m[2] = { 0, 0 }; + int sz; + + /* Don't error out if this optional property isn't found */ + sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2); + if (sz < 0 && sz != -EINVAL) + return sz; + + ds->index = m[1]; + if (ds->index >= DSA_MAX_SWITCHES) + return -EINVAL; + + ds->dst = dsa_tree_touch(m[0]); + if (!ds->dst) + return -ENOMEM; + + return 0; +} + +static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) +{ + int err; + + err = dsa_switch_parse_member_of(ds, dn); + if (err) + return err; + + return dsa_switch_parse_ports_of(ds, dn); +} + static int dsa_port_parse(struct dsa_port *dp, const char *name, struct device *dev) { @@ -673,7 +707,8 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name, return 0; } -static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds) +static int dsa_switch_parse_ports(struct dsa_switch *ds, + struct dsa_chip_data *cd) { bool valid_name_found = false; struct dsa_port *dp; @@ -703,40 +738,19 @@ static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds) return 0; } -static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index) -{ - int err; - - *tree = *index = 0; - - err = of_property_read_u32_index(np, "dsa,member", 0, tree); - if (err) { - /* Does not exist, but it is optional */ - if (err == -EINVAL) - return 0; - return err; - } - - err = of_property_read_u32_index(np, "dsa,member", 1, index); - if (err) - return err; - - if (*index >= DSA_MAX_SWITCHES) - return -EINVAL; - - return 0; -} - -static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index) +static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) { - if (!pd) - return -ENODEV; + ds->cd = cd; - /* We do not support complex trees with dsa_chip_data */ - *tree = 0; - *index = 0; + /* We don't support interconnected switches nor multiple trees via + * platform data, so this is the unique switch of the tree. + */ + ds->index = 0; + ds->dst = dsa_tree_touch(0); + if (!ds->dst) + return -ENOMEM; - return 0; + return dsa_switch_parse_ports(ds, cd); } static int _dsa_register_switch(struct dsa_switch *ds) @@ -744,36 +758,21 @@ static int _dsa_register_switch(struct dsa_switch *ds) struct dsa_chip_data *pdata = ds->dev->platform_data; struct device_node *np = ds->dev->of_node; struct dsa_switch_tree *dst; - u32 tree, index; + unsigned int index; int i, err; - if (np) { - err = dsa_parse_member_dn(np, &tree, &index); - if (err) - return err; - } else { - err = dsa_parse_member(pdata, &tree, &index); - if (err) - return err; - } + if (np) + err = dsa_switch_parse_of(ds, np); + else if (pdata) + err = dsa_switch_parse(ds, pdata); + else + err = -ENODEV; - dst = dsa_tree_touch(tree); - if (!dst) - return -ENOMEM; - - ds->dst = dst; - ds->index = index; - ds->cd = pdata; + if (err) + return err; - if (np) { - err = dsa_parse_ports_of(np, ds); - if (err) - return err; - } else { - err = dsa_parse_ports(pdata, ds); - if (err) - return err; - } + index = ds->index; + dst = ds->dst; /* Initialize the routing table */ for (i = 0; i < DSA_MAX_SWITCHES; ++i) -- cgit v1.2.3 From 54df6fa9541752b3b28106eb1a9764dcda815fd2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:28 -0400 Subject: net: dsa: only check presence of link property When parsing a port, simply use of_property_read_bool which checks the presence of a given property, instead of parsing the link phandle. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dfcb6247f2f2..06bcdb6bc796 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -590,8 +590,8 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) { struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); - struct device_node *link = of_parse_phandle(dn, "link", 0); const char *name = of_get_property(dn, "label", NULL); + bool link = of_property_read_bool(dn, "link"); if (ethernet) { struct net_device *master; -- cgit v1.2.3 From 06e24d0868a361774fc46d0819450915e63a2815 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:29 -0400 Subject: net: dsa: add one port parsing function per type Add dsa_port_parse_user, dsa_port_parse_dsa and dsa_port_parse_cpu functions to factorize the code shared by both OF and pdata parsing. They don't do much for the moment but will be extended later to support tagging protocol resolution for example. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 56 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 06bcdb6bc796..271a97ef5bf6 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -491,6 +491,32 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, return 0; } +static int dsa_port_parse_user(struct dsa_port *dp, const char *name) +{ + if (!name) + name = "eth%d"; + + dp->type = DSA_PORT_TYPE_USER; + dp->name = name; + + return 0; +} + +static int dsa_port_parse_dsa(struct dsa_port *dp) +{ + dp->type = DSA_PORT_TYPE_DSA; + + return 0; +} + +static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) +{ + dp->type = DSA_PORT_TYPE_CPU; + dp->master = master; + + return 0; +} + static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) @@ -593,6 +619,8 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) const char *name = of_get_property(dn, "label", NULL); bool link = of_property_read_bool(dn, "link"); + dp->dn = dn; + if (ethernet) { struct net_device *master; @@ -600,21 +628,13 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) if (!master) return -EPROBE_DEFER; - dp->type = DSA_PORT_TYPE_CPU; - dp->master = master; - } else if (link) { - dp->type = DSA_PORT_TYPE_DSA; - } else { - if (!name) - name = "eth%d"; - - dp->type = DSA_PORT_TYPE_USER; - dp->name = name; + return dsa_port_parse_cpu(dp, master); } - dp->dn = dn; + if (link) + return dsa_port_parse_dsa(dp); - return 0; + return dsa_port_parse_user(dp, name); } static int dsa_switch_parse_ports_of(struct dsa_switch *ds, @@ -694,17 +714,13 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name, dev_put(master); - dp->type = DSA_PORT_TYPE_CPU; - dp->master = master; - } else if (!strcmp(name, "dsa")) { - dp->type = DSA_PORT_TYPE_DSA; - } else { - dp->type = DSA_PORT_TYPE_USER; + return dsa_port_parse_cpu(dp, master); } - dp->name = name; + if (!strcmp(name, "dsa")) + return dsa_port_parse_dsa(dp); - return 0; + return dsa_port_parse_user(dp, name); } static int dsa_switch_parse_ports(struct dsa_switch *ds, -- cgit v1.2.3 From 7354fcb0a3a3a5518107f8de117a6ce5ce08cc7c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:30 -0400 Subject: net: dsa: resolve tagging protocol at parse time Extend the dsa_port_parse_cpu() function to resolve the tagging protocol at port parsing time, instead of waiting for the whole tree to be complete. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 271a97ef5bf6..283104e5ca6a 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -511,8 +511,23 @@ static int dsa_port_parse_dsa(struct dsa_port *dp) static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; + const struct dsa_device_ops *tag_ops; + enum dsa_tag_protocol tag_protocol; + + tag_protocol = ds->ops->get_tag_protocol(ds); + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) { + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(tag_ops); + } + dp->type = DSA_PORT_TYPE_CPU; + dp->rcv = tag_ops->rcv; + dp->tag_ops = tag_ops; dp->master = master; + dp->dst = dst; return 0; } @@ -521,25 +536,9 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { - const struct dsa_device_ops *tag_ops; - enum dsa_tag_protocol tag_protocol; - if (!dst->cpu_dp) dst->cpu_dp = port; - tag_protocol = ds->ops->get_tag_protocol(ds); - tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(tag_ops)) { - dev_warn(ds->dev, "No tagger for this switch\n"); - return PTR_ERR(tag_ops); - } - - dst->cpu_dp->tag_ops = tag_ops; - - /* Make a few copies for faster access in master receive hot path */ - dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->cpu_dp->dst = dst; - return 0; } -- cgit v1.2.3 From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Fri, 3 Nov 2017 16:38:48 -0700 Subject: tcp: higher throughput under reordering with adaptive RACK reordering wnd Currently TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4).Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly. This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is possibility that spurious retransmission was due to reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt. So that the new DSACK on which we are reacting, is due to the spurious retx (approx) after the reo_wnd has been updated last time. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 7 +++++++ net/ipv4/tcp_minisocks.c | 4 ++++ net/ipv4/tcp_recovery.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 58 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7a0f316eb86..c4cb19ed4628 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tcp_assign_congestion_control(sk); tp->tsoffset = 0; + tp->rack.reo_wnd_steps = 1; sk->sk_state = TCP_CLOSE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8393b405ea98..0ada8bfc2ebd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp) static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->rack.dsack_seen = 1; } static void tcp_update_reordering(struct sock *sk, const int metric, @@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { + tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, + tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); @@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + tcp_rack_update_reo_wnd(sk, &rs); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); /* If needed, reset TLP/RTO timer; RACK may later override this. */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3c65c1a3f944..4bb86580decd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index ac3e9c6d3a3d..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt = tcp_min_rtt(tp); struct sk_buff *skb, *n; u32 reo_wnd; @@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { @@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} -- cgit v1.2.3 From c45e3e4c5b134b081e8af362109905427967eb19 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sun, 9 Jul 2017 13:08:58 +0200 Subject: NFC: fix device-allocation error return A recent change fixing NFC device allocation itself introduced an error-handling bug by returning an error pointer in case device-id allocation failed. This is clearly broken as the callers still expected NULL to be returned on errors as detected by Dan's static checker. Fix this up by returning NULL in the event that we've run out of memory when allocating a new device id. Note that the offending commit is marked for stable (3.8) so this fix needs to be backported along with it. Fixes: 20777bc57c34 ("NFC: fix broken device allocation") Cc: stable # 3.8 Reported-by: Dan Carpenter Signed-off-by: Johan Hovold Signed-off-by: Samuel Ortiz --- net/nfc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/core.c b/net/nfc/core.c index e5e23c2cbe74..2c7c9b357e70 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1105,7 +1105,7 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, err_free_dev: kfree(dev); - return ERR_PTR(rc); + return NULL; } EXPORT_SYMBOL(nfc_allocate_device); -- cgit v1.2.3 From 4b519bb493e0866de7659b88dd22dc2cd89dd628 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Wed, 11 Oct 2017 16:03:44 +0530 Subject: NFC: Convert timers to use timer_setup() Switch to using the new timer_setup() and from_timer() for net/nfc/* Signed-off-by: Allen Pais Reviewed-by: Kees Cook Signed-off-by: Samuel Ortiz --- net/nfc/core.c | 8 +++----- net/nfc/hci/core.c | 7 +++---- net/nfc/hci/llc_shdlc.c | 23 +++++++++-------------- net/nfc/llcp_core.c | 14 ++++++-------- 4 files changed, 21 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/nfc/core.c b/net/nfc/core.c index 2c7c9b357e70..947a470f929d 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1015,9 +1015,9 @@ exit: device_unlock(&dev->dev); } -static void nfc_check_pres_timeout(unsigned long data) +static void nfc_check_pres_timeout(struct timer_list *t) { - struct nfc_dev *dev = (struct nfc_dev *)data; + struct nfc_dev *dev = from_timer(dev, t, check_pres_timer); schedule_work(&dev->check_pres_work); } @@ -1094,9 +1094,7 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->targets_generation = 1; if (ops->check_presence) { - setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout, - (unsigned long)dev); - + timer_setup(&dev->check_pres_timer, nfc_check_pres_timeout, 0); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index a8a6e7814e09..ac8030c4bcf8 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -428,9 +428,9 @@ exit_noskb: nfc_hci_driver_failure(hdev, r); } -static void nfc_hci_cmd_timeout(unsigned long data) +static void nfc_hci_cmd_timeout(struct timer_list *t) { - struct nfc_hci_dev *hdev = (struct nfc_hci_dev *)data; + struct nfc_hci_dev *hdev = from_timer(hdev, t, cmd_timer); schedule_work(&hdev->msg_tx_work); } @@ -1004,8 +1004,7 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev) INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work); - setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout, - (unsigned long)hdev); + timer_setup(&hdev->cmd_timer, nfc_hci_cmd_timeout, 0); skb_queue_head_init(&hdev->rx_hcp_frags); diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 58df37eae1e8..fe988936ad92 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -580,27 +580,27 @@ static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc) } } -static void llc_shdlc_connect_timeout(unsigned long data) +static void llc_shdlc_connect_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, connect_timer); pr_debug("\n"); schedule_work(&shdlc->sm_work); } -static void llc_shdlc_t1_timeout(unsigned long data) +static void llc_shdlc_t1_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, t1_timer); pr_debug("SoftIRQ: need to send ack\n"); schedule_work(&shdlc->sm_work); } -static void llc_shdlc_t2_timeout(unsigned long data) +static void llc_shdlc_t2_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = (struct llc_shdlc *)data; + struct llc_shdlc *shdlc = from_timer(shdlc, t, t2_timer); pr_debug("SoftIRQ: need to retransmit\n"); @@ -763,14 +763,9 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, mutex_init(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; - setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout, - (unsigned long)shdlc); - - setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout, - (unsigned long)shdlc); - - setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout, - (unsigned long)shdlc); + timer_setup(&shdlc->connect_timer, llc_shdlc_connect_timeout, 0); + timer_setup(&shdlc->t1_timer, llc_shdlc_t1_timeout, 0); + timer_setup(&shdlc->t2_timer, llc_shdlc_t2_timeout, 0); shdlc->w = SHDLC_MAX_WINDOW; shdlc->srej_support = SHDLC_SREJ_SUPPORT; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 7988185072e5..ef4026a23e80 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -242,9 +242,9 @@ static void nfc_llcp_timeout_work(struct work_struct *work) nfc_dep_link_down(local->dev); } -static void nfc_llcp_symm_timer(unsigned long data) +static void nfc_llcp_symm_timer(struct timer_list *t) { - struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; + struct nfc_llcp_local *local = from_timer(local, t, link_timer); pr_err("SYMM timeout\n"); @@ -285,9 +285,9 @@ static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); } -static void nfc_llcp_sdreq_timer(unsigned long data) +static void nfc_llcp_sdreq_timer(struct timer_list *t) { - struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; + struct nfc_llcp_local *local = from_timer(local, t, sdreq_timer); schedule_work(&local->sdreq_timeout_work); } @@ -1573,8 +1573,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); - setup_timer(&local->link_timer, nfc_llcp_symm_timer, - (unsigned long)local); + timer_setup(&local->link_timer, nfc_llcp_symm_timer, 0); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); @@ -1600,8 +1599,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); - setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer, - (unsigned long)local); + timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); list_add(&local->list, &llcp_devices); -- cgit v1.2.3 From c5cc0c697149345ded2f792a919097cd51464274 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 6 Sep 2017 22:28:00 +0200 Subject: netfilter: ipvs: Use %pS printk format for direct addresses The debug and error printk functions in ipvs uses wrongly the %pF instead of the %pS printk format specifier for printing symbols for the address returned by _builtin_return_address(0). Fix it for the ia64, ppc64 and parisc64 architectures. Signed-off-by: Helge Deller Cc: Wensong Zhang Cc: netdev@vger.kernel.org Cc: lvs-devel@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 3d2ac71a83ec..f73561ca982d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -185,7 +185,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { - pr_err("%s(): request for already hashed, called from %pF\n", + pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); ret = 0; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4f940d7eb2f7..b825835752e6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -300,7 +300,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { - pr_err("%s(): request for already hashed, called from %pF\n", + pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } @@ -334,7 +334,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - pr_err("%s(): request for unhash flagged, called from %pF\n", + pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } -- cgit v1.2.3 From c5504f724c86ee925e7ffb80aa342cfd57959b13 Mon Sep 17 00:00:00 2001 From: KUWAZAWA Takuya Date: Sun, 15 Oct 2017 20:54:10 +0900 Subject: netfilter: ipvs: Fix inappropriate output of procfs Information about ipvs in different network namespace can be seen via procfs. How to reproduce: # ip netns add ns01 # ip netns add ns02 # ip netns exec ns01 ip a add dev lo 127.0.0.1/8 # ip netns exec ns02 ip a add dev lo 127.0.0.1/8 # ip netns exec ns01 ipvsadm -A -t 10.1.1.1:80 # ip netns exec ns02 ipvsadm -A -t 10.1.1.2:80 The ipvsadm displays information about its own network namespace only. # ip netns exec ns01 ipvsadm -Ln IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP 10.1.1.1:80 wlc # ip netns exec ns02 ipvsadm -Ln IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP 10.1.1.2:80 wlc But I can see information about other network namespace via procfs. # ip netns exec ns01 cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP 0A010101:0050 wlc TCP 0A010102:0050 wlc # ip netns exec ns02 cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP 0A010102:0050 wlc Signed-off-by: KUWAZAWA Takuya Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b825835752e6..fac8c802b4ea 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2034,12 +2034,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; + if (svc->ipvs != ipvs) + return 0; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) -- cgit v1.2.3 From 9912156c2e42a5b0100da37275622262ece02c05 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 Oct 2017 11:24:02 +0100 Subject: netfilter: ebtables: clean up initialization of buf buf is initialized to buf_start and then set on the next statement to buf_start + offsets[i]. Clean this up to just initialize buf to buf_start + offsets[i] to clean up the clang build warning: "Value stored to 'buf' during its initialization is never read" Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83951f978445..54c274dbf4f1 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2111,9 +2111,8 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, for (i = 0, j = 1 ; j < 4 ; j++, i++) { struct compat_ebt_entry_mwt *match32; unsigned int size; - char *buf = buf_start; + char *buf = buf_start + offsets[i]; - buf = buf_start + offsets[i]; if (offsets[i] > offsets[j]) return -EINVAL; -- cgit v1.2.3 From b1fc1372c48027a5b99943905cf7cfa6d622d6a9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 12:50:38 +0100 Subject: netfilter: xt_connlimit: remove mask argument Instead of passing mask to all the helpers, just fixup the search key early. After rbtree conversion, each rbtree node stores connections of same 'addr & mask', so no need to pass the mask too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connlimit.c | 52 +++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index ce2870428631..a6214f235333 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -71,16 +71,9 @@ static inline unsigned int connlimit_iphash(__be32 addr) } static inline unsigned int -connlimit_iphash6(const union nf_inet_addr *addr, - const union nf_inet_addr *mask) +connlimit_iphash6(const union nf_inet_addr *addr) { - union nf_inet_addr res; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) - res.ip6[i] = addr->ip6[i] & mask->ip6[i]; - - return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), connlimit_rnd) % CONNLIMIT_SLOTS; } @@ -94,24 +87,13 @@ static inline bool already_closed(const struct nf_conn *conn) } static int -same_source_net(const union nf_inet_addr *addr, - const union nf_inet_addr *mask, - const union nf_inet_addr *u3, u_int8_t family) +same_source(const union nf_inet_addr *addr, + const union nf_inet_addr *u3, u_int8_t family) { - if (family == NFPROTO_IPV4) { - return ntohl(addr->ip & mask->ip) - - ntohl(u3->ip & mask->ip); - } else { - union nf_inet_addr lh, rh; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { - lh.ip6[i] = addr->ip6[i] & mask->ip6[i]; - rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; - } + if (family == NFPROTO_IPV4) + return ntohl(addr->ip) - ntohl(u3->ip); - return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)); - } + return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6)); } static bool add_hlist(struct hlist_head *head, @@ -194,7 +176,7 @@ static void tree_nodes_free(struct rb_root *root, static unsigned int count_tree(struct net *net, struct rb_root *root, const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, const union nf_inet_addr *mask, + const union nf_inet_addr *addr, u8 family, const struct nf_conntrack_zone *zone) { struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; @@ -215,7 +197,7 @@ count_tree(struct net *net, struct rb_root *root, rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node); parent = *rbnode; - diff = same_source_net(addr, mask, &rbconn->addr, family); + diff = same_source(addr, &rbconn->addr, family); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { @@ -282,7 +264,6 @@ static int count_them(struct net *net, struct xt_connlimit_data *data, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, - const union nf_inet_addr *mask, u_int8_t family, const struct nf_conntrack_zone *zone) { @@ -291,14 +272,14 @@ static int count_them(struct net *net, u32 hash; if (family == NFPROTO_IPV6) - hash = connlimit_iphash6(addr, mask); + hash = connlimit_iphash6(addr); else - hash = connlimit_iphash(addr->ip & mask->ip); + hash = connlimit_iphash(addr->ip); root = &data->climit_root[hash]; spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - count = count_tree(net, root, tuple, addr, mask, family, zone); + count = count_tree(net, root, tuple, addr, family, zone); spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); @@ -329,16 +310,23 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int i; + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? &iph->daddr : &iph->saddr, sizeof(addr.ip6)); + + for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i) + addr.ip6[i] &= info->mask.ip6[i]; } else { const struct iphdr *iph = ip_hdr(skb); addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? iph->daddr : iph->saddr; + + addr.ip &= info->mask.ip; } connections = count_them(net, info->data, tuple_ptr, &addr, - &info->mask, xt_family(par), zone); + xt_family(par), zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; -- cgit v1.2.3 From 7f4dae2d7f03d2aaf3b7d8343d4509c8d9d7ca9b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:04:47 +0100 Subject: netfilter: nft_hash: fix nft_hash_deactivate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jindřich Makovička says: The logical OR looks fishy to me. Shouldn't be && there instead? Link: https://bugzilla.netfilter.org/show_bug.cgi?id=1199 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 0fa01d772c5e..650677f1e539 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -494,7 +494,7 @@ static void *nft_hash_deactivate(const struct net *net, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&this->ext), &elem->key.val, - set->klen) || + set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); return he; -- cgit v1.2.3 From 5caaed151a68ae36aca2981cc245f5960a0a7603 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:41:09 +0100 Subject: netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size We currently call ->nlattr_tuple_size() once at register time and cache result in l4proto->nla_size. nla_size is the only member that is written to, avoiding this would allow to make l4proto trackers const. We can use ->nlattr_tuple_size() at run time, and cache result in the individual trackers instead. This is an intermediate step, next patch removes nlattr_size() callback and computes size at compile time, then removes nla_size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 9 +++++++-- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 9 +++++++-- net/netfilter/nf_conntrack_core.c | 9 +++++++-- net/netfilter/nf_conntrack_netlink.c | 10 +++++++--- net/netfilter/nf_conntrack_proto.c | 2 -- net/netfilter/nf_conntrack_proto_tcp.c | 9 +++++++-- 6 files changed, 35 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8969420cecc3..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -258,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index dca921df28e1..3ac0d826afc4 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -259,9 +259,14 @@ static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmpv6_nlattr_tuple_size(void) +static unsigned int icmpv6_nlattr_tuple_size(void) { - return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 28e675150853..0e516947c16f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1563,9 +1563,14 @@ int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); -int nf_ct_port_nlattr_tuple_size(void) +unsigned int nf_ct_port_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index de4053d84364..6e0adfefb9ed 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,11 +533,11 @@ nla_put_failure: return -1; } -static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) +static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - size_t len; + size_t len, len4 = 0; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); len = l3proto->nla_size; @@ -545,8 +545,12 @@ static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; + if (l4proto->nlattr_tuple_size) { + len4 = l4proto->nlattr_tuple_size(); + len4 *= 3u; /* ORIG, REPLY, MASTER */ + } - return len; + return len + len4; } static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 83f739e9dc08..3b06ff3f2dee 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -398,8 +398,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) l4proto->nla_size = 0; if (l4proto->nlattr_size) l4proto->nla_size += l4proto->nlattr_size(); - if (l4proto->nlattr_tuple_size) - l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8f283294d70f..b12fc07111d0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1277,9 +1277,14 @@ static int tcp_nlattr_size(void) + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); } -static int tcp_nlattr_tuple_size(void) +static unsigned int tcp_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif -- cgit v1.2.3 From 7e35ec0e8044f1ede852f55948f71a1963903219 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 3 Nov 2017 16:26:32 +0100 Subject: netfilter: conntrack: move nf_ct_netns_{get,put}() to core So we can call this from other expression that need conntrack in place to work. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- net/netfilter/nf_conntrack_proto.c | 37 ++++++++++++++++++++++++++++++++++-- net/netfilter/nft_ct.c | 39 +++----------------------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 3b06ff3f2dee..c8e9c9503a08 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -172,7 +172,7 @@ void nf_ct_l3proto_module_put(unsigned short l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { const struct nf_conntrack_l3proto *l3proto; int ret; @@ -197,9 +197,33 @@ int nf_ct_netns_get(struct net *net, u8 nfproto) return ret; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + if (nfproto == NFPROTO_INET) { + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_netns_do_get(net, nfproto); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_netns_put(net, NFPROTO_IPV4); +err1: + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); -void nf_ct_netns_put(struct net *net, u8 nfproto) +static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { const struct nf_conntrack_l3proto *l3proto; @@ -218,6 +242,15 @@ void nf_ct_netns_put(struct net *net, u8 nfproto) nf_ct_l3proto_module_put(nfproto); } + +void nf_ct_netns_put(struct net *net, uint8_t nfproto) +{ + if (nfproto == NFPROTO_INET) { + nf_ct_netns_do_put(net, NFPROTO_IPV4); + nf_ct_netns_do_put(net, NFPROTO_IPV6); + } else + nf_ct_netns_do_put(net, nfproto); +} EXPORT_SYMBOL_GPL(nf_ct_netns_put); const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bd0975d7dd6f..2647b895f4b0 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -312,39 +312,6 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_SREG] = { .type = NLA_U32 }, }; -static int nft_ct_netns_get(struct net *net, uint8_t family) -{ - int err; - - if (family == NFPROTO_INET) { - err = nf_ct_netns_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_get(net, family); - if (err < 0) - goto err1; - } - return 0; - -err2: - nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} - -static void nft_ct_netns_put(struct net *net, uint8_t family) -{ - if (family == NFPROTO_INET) { - nf_ct_netns_put(net, NFPROTO_IPV4); - nf_ct_netns_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_put(net, family); -} - #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { @@ -489,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) return err; @@ -583,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err < 0) goto err1; - err = nft_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) goto err1; @@ -606,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); - nft_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->afi->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) -- cgit v1.2.3 From 0984d427c1d3cb2aa882c9ad9e787ba973cf2915 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 2 Nov 2017 16:16:07 +0100 Subject: netfilter: conntrack: use power efficient workqueue conntrack uses the bounded system_long_wq workqueue for its works that don't have to run on the cpu they have been queued. Using bounded workqueue prevents the scheduler to make smart decision about the best place to schedule the work. This patch replaces system_long_wq with system_power_efficient_wq. the work stays bounded to a cpu by default unless the CONFIG_WQ_POWER_EFFICIENT is enable. In the latter case, the work can be scheduled on the best cpu from a power or a performance point of view. Signed-off-by: Vincent Guittot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0e516947c16f..5749fcaa2770 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1083,7 +1083,7 @@ static void gc_worker(struct work_struct *work) next_run = gc_work->next_gc_run; gc_work->last_bucket = i; gc_work->early_drop = false; - queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); + queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) @@ -2089,7 +2089,7 @@ int nf_conntrack_init_start(void) goto err_proto; conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); + queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); return 0; -- cgit v1.2.3 From 644e334eeec01a25138b62ebd576b3a798183c7c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Nov 2017 05:57:13 +0100 Subject: netfilter: nf_tables: performance set policy skips size description in selection Use the complexity and space notations if policy is performance, this results in placing the bitmap set representation over the hashtable for key <= 16 for better performance as we discussed during the last NFWS in Faro, Portugal. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 929927171426..3b4a0739ee39 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2549,14 +2549,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; - if (est.lookup == best.lookup) { - if (!desc->size) { - if (est.space < best.space) - break; - } else if (est.size < best.size) { - break; - } - } + if (est.lookup == best.lookup && + est.space < best.space) + break; continue; case NFT_SET_POL_MEMORY: if (!desc->size) { -- cgit v1.2.3 From ba0e4d9917b43dfa746cbbcb4477da59aae73bd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Oct 2017 19:52:28 +0200 Subject: netfilter: nf_tables: get set elements via netlink This patch adds a new get operation to look up for specific elements in a set via netlink interface. You can also use it to check if an interval already exists. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 184 +++++++++++++++++++++++++++++------------ net/netfilter/nft_set_bitmap.c | 18 ++++ net/netfilter/nft_set_hash.c | 39 +++++++++ net/netfilter/nft_set_rbtree.c | 73 ++++++++++++++++ 4 files changed, 259 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b4a0739ee39..1d66be0d8ef7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3586,45 +3586,6 @@ static int nf_tables_dump_set_done(struct netlink_callback *cb) return 0; } -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) -{ - u8 genmask = nft_genmask_cur(net); - const struct nft_set *set; - struct nft_ctx ctx; - int err; - - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); - if (err < 0) - return err; - - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = nf_tables_dump_set, - .done = nf_tables_dump_set_done, - }; - struct nft_set_dump_ctx *dump_ctx; - - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); - if (!dump_ctx) - return -ENOMEM; - - dump_ctx->set = set; - dump_ctx->ctx = ctx; - - c.data = dump_ctx; - return netlink_dump_start(nlsk, skb, nlh, &c); - } - return -EOPNOTSUPP; -} - static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, @@ -3670,6 +3631,135 @@ nla_put_failure: return -1; } +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + +static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + const struct nft_set_ext *ext; + struct nft_data_desc desc; + struct nft_set_elem elem; + struct sk_buff *skb; + uint32_t flags = 0; + void *priv; + int err; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); + if (err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY]) + return -EINVAL; + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + + err = -EINVAL; + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + return err; + + priv = set->ops->get(ctx->net, set, &elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + elem.priv = priv; + ext = nft_set_elem_ext(set, &elem); + + err = -ENOMEM; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err1; + + err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, + NFT_MSG_NEWSETELEM, 0, set, &elem); + if (err < 0) + goto err2; + + err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); + /* This avoids a loop in nfnetlink. */ + if (err < 0) + goto err1; + + return 0; +err2: + kfree_skb(skb); +err1: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + u8 genmask = nft_genmask_cur(net); + struct nft_set *set; + struct nlattr *attr; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); + if (IS_ERR(set)) + return PTR_ERR(set); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, + }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&ctx, set, attr); + if (err < 0) + break; + } + + return err; +} + static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, @@ -3770,22 +3860,6 @@ static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) kfree(elem); } -static int nft_setelem_parse_flags(const struct nft_set *set, - const struct nlattr *attr, u32 *flags) -{ - if (attr == NULL) - return 0; - - *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - *flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - - return 0; -} - static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 734989c40579..45fb2752fb63 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -106,6 +106,23 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } +static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_bitmap_elem *be; + + list_for_each_entry_rcu(be, &priv->list, head) { + if (memcmp(nft_set_ext_key(&be->ext), elem->key.val.data, set->klen) || + !nft_set_elem_active(&be->ext, genmask)) + continue; + + return be; + } + return ERR_PTR(-ENOENT); +} + static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) @@ -294,6 +311,7 @@ static struct nft_set_ops nft_bitmap_ops __read_mostly = { .activate = nft_bitmap_activate, .lookup = nft_bitmap_lookup, .walk = nft_bitmap_walk, + .get = nft_bitmap_get, }; static struct nft_set_type nft_bitmap_type __read_mostly = { diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 650677f1e539..c68a7e0fcf1e 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -95,6 +95,24 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, return !!he; } +static void *nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he; + struct nft_rhash_cmp_arg arg = { + .genmask = nft_genmask_cur(net), + .set = set, + .key = elem->key.val.data, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + if (he != NULL) + return he; + + return ERR_PTR(-ENOENT); +} + static bool nft_rhash_update(struct nft_set *set, const u32 *key, void *(*new)(struct nft_set *, const struct nft_expr *, @@ -409,6 +427,24 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } +static void *nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_hash_elem *he; + u32 hash; + + hash = jhash(elem->key.val.data, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry_rcu(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && + nft_set_elem_active(&he->ext, genmask)) + return he; + } + return ERR_PTR(-ENOENT); +} + /* nft_hash_select_ops() makes sure key size can be either 2 or 4 bytes . */ static inline u32 nft_hash_key(const u32 *key, u32 klen) { @@ -600,6 +636,7 @@ static struct nft_set_ops nft_rhash_ops __read_mostly = { .lookup = nft_rhash_lookup, .update = nft_rhash_update, .walk = nft_rhash_walk, + .get = nft_rhash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, }; @@ -617,6 +654,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; @@ -634,6 +672,7 @@ static struct nft_set_ops nft_hash_fast_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup_fast, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index d83a4ec5900d..e6f08bc5f359 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -113,6 +113,78 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, return ret; } +static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, + const u32 *key, struct nft_rbtree_elem **elem, + unsigned int seq, unsigned int flags, u8 genmask) +{ + struct nft_rbtree_elem *rbe, *interval = NULL; + struct nft_rbtree *priv = nft_set_priv(set); + const struct rb_node *parent; + const void *this; + int d; + + parent = rcu_dereference_raw(priv->root.rb_node); + while (parent != NULL) { + if (read_seqcount_retry(&priv->count, seq)) + return false; + + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); + if (d < 0) { + parent = rcu_dereference_raw(parent->rb_left); + interval = rbe; + } else if (d > 0) { + parent = rcu_dereference_raw(parent->rb_right); + } else { + if (!nft_set_elem_active(&rbe->ext, genmask)) + parent = rcu_dereference_raw(parent->rb_left); + + if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == + (flags & NFT_SET_ELEM_INTERVAL_END)) { + *elem = rbe; + return true; + } + return false; + } + } + + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && + !nft_rbtree_interval_end(interval)) { + *elem = interval; + return true; + } + + return false; +} + +static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rbtree *priv = nft_set_priv(set); + unsigned int seq = read_seqcount_begin(&priv->count); + struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); + const u32 *key = (const u32 *)&elem->key.val; + u8 genmask = nft_genmask_cur(net); + bool ret; + + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (ret || !read_seqcount_retry(&priv->count, seq)) + return rbe; + + read_lock_bh(&priv->lock); + seq = read_seqcount_begin(&priv->count); + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (!ret) + rbe = ERR_PTR(-ENOENT); + read_unlock_bh(&priv->lock); + + return rbe; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) @@ -336,6 +408,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, + .get = nft_rbtree_get, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, }; -- cgit v1.2.3 From fffcefe967a02997be7a296a4f0766b29dcd1a67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Nov 2017 14:13:29 -0800 Subject: ipv6: addrconf: fix a lockdep splat Fixes a case where GFP_ATOMIC allocation must be used instead of GFP_KERNEL one. [ 54.891146] lock_acquire+0xb3/0x2f0 [ 54.891153] ? fs_reclaim_acquire.part.60+0x5/0x30 [ 54.891165] fs_reclaim_acquire.part.60+0x29/0x30 [ 54.891170] ? fs_reclaim_acquire.part.60+0x5/0x30 [ 54.891178] kmem_cache_alloc_trace+0x3f/0x500 [ 54.891186] ? cyc2ns_read_end+0x1e/0x30 [ 54.891196] ipv6_add_addr+0x15a/0xc30 [ 54.891217] ? ipv6_create_tempaddr+0x2ea/0x5d0 [ 54.891223] ipv6_create_tempaddr+0x2ea/0x5d0 [ 54.891238] ? manage_tempaddrs+0x195/0x220 [ 54.891249] ? addrconf_prefix_rcv_add_addr+0x1c0/0x4f0 [ 54.891255] addrconf_prefix_rcv_add_addr+0x1c0/0x4f0 [ 54.891268] addrconf_prefix_rcv+0x2e5/0x9b0 [ 54.891279] ? neigh_update+0x446/0xb90 [ 54.891298] ? ndisc_router_discovery+0x5ab/0xf00 [ 54.891303] ndisc_router_discovery+0x5ab/0xf00 [ 54.891311] ? retint_kernel+0x2d/0x2d [ 54.891331] ndisc_rcv+0x1b6/0x270 [ 54.891340] icmpv6_rcv+0x6aa/0x9f0 [ 54.891345] ? ipv6_chk_mcast_addr+0x176/0x530 [ 54.891351] ? do_csum+0x17b/0x260 [ 54.891360] ip6_input_finish+0x194/0xb20 [ 54.891372] ip6_input+0x5b/0x2c0 [ 54.891380] ? ip6_rcv_finish+0x320/0x320 [ 54.891389] ip6_mc_input+0x15a/0x250 [ 54.891396] ipv6_rcv+0x772/0x1050 [ 54.891403] ? consume_skb+0xbe/0x2d0 [ 54.891412] ? ip6_make_skb+0x2a0/0x2a0 [ 54.891418] ? ip6_input+0x2c0/0x2c0 [ 54.891425] __netif_receive_skb_core+0xa0f/0x1600 [ 54.891436] ? process_backlog+0xac/0x400 [ 54.891441] process_backlog+0xfa/0x400 [ 54.891448] ? net_rx_action+0x145/0x1130 [ 54.891456] net_rx_action+0x310/0x1130 [ 54.891524] ? RTUSBBulkReceive+0x11d/0x190 [mt7610u_sta] [ 54.891538] __do_softirq+0x140/0xaba [ 54.891553] irq_exit+0x10b/0x160 [ 54.891561] do_IRQ+0xbb/0x1b0 Fixes: f3d9832e56c4 ("ipv6: addrconf: cleanup locking in ipv6_add_addr") Signed-off-by: Eric Dumazet Reported-by: Valdis Kletnieks Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 66d8c3d912fd..6233e06fa35c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1267,7 +1267,9 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, + struct inet6_ifaddr *ift, + bool block) { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; @@ -1371,7 +1373,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true, NULL); + tmp_valid_lft, tmp_prefered_lft, block, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -1956,7 +1958,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); @@ -2456,7 +2458,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL); + ipv6_create_tempaddr(ifp, NULL, false); } else { read_unlock_bh(&idev->lock); } @@ -4351,7 +4353,7 @@ restart: spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); goto restart; -- cgit v1.2.3 From da36e6dbf478cb67a5df0da6afcd78df226e4c64 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 Oct 2017 14:40:21 +0100 Subject: sunrcp: make function _svc_create_xprt static The function _svc_create_xprt is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol '_svc_create_xprt' was not declared. Should it be static? Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d16a8b423c20..18e87791350f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -250,9 +250,9 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) svc_xprt_received(new); } -int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, - struct net *net, const int family, - const unsigned short port, int flags) +static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + struct net *net, const int family, + const unsigned short port, int flags) { struct svc_xprt_class *xcl; -- cgit v1.2.3 From 0bad47cada5defba13e98827d22d06f13258dfb3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 12:14:33 -0400 Subject: svcrdma: Preserve CB send buffer across retransmits During each NFSv4 callback Call, an RDMA Send completion frees the page that contains the RPC Call message. If the upper layer determines that a retransmit is necessary, this is too soon. One possible symptom: after a GARBAGE_ARGS response an NFSv4.1 callback request, the following BUG fires on the NFS server: kernel: BUG: Bad page state in process kworker/0:2H pfn:7d3ce2 kernel: page:ffffea001f4f3880 count:-2 mapcount:0 mapping: (null) index:0x0 kernel: flags: 0x2fffff80000000() kernel: raw: 002fffff80000000 0000000000000000 0000000000000000 fffffffeffffffff kernel: raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 kernel: page dumped because: nonzero _refcount kernel: Modules linked in: cts rpcsec_gss_krb5 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue rpcrdm a ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pc lmul crc32_pclmul ghash_clmulni_intel pcbc iTCO_wdt iTCO_vendor_support aesni_intel crypto_simd glue_helper cryptd pcspkr lpc_ich i2c_i801 mei_me mf d_core mei raid0 sg wmi ioatdma ipmi_si ipmi_devintf ipmi_msghandler shpchp acpi_power_meter acpi_pad nfsd nfs_acl lockd auth_rpcgss grace sunrpc ip_tables xfs libcrc32c mlx4_en mlx4_ib mlx5_ib ib_core sd_mod sr_mod cdrom ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ahci crc32c_intel libahci drm mlx5_core igb libata mlx4_core dca i2c_algo_bit i2c_core nvme kernel: ptp nvme_core pps_core dm_mirror dm_region_hash dm_log dm_mod dax kernel: CPU: 0 PID: 11495 Comm: kworker/0:2H Not tainted 4.14.0-rc3-00001-g577ce48 #811 kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 kernel: Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] kernel: Call Trace: kernel: dump_stack+0x62/0x80 kernel: bad_page+0xfe/0x11a kernel: free_pages_check_bad+0x76/0x78 kernel: free_pcppages_bulk+0x364/0x441 kernel: ? ttwu_do_activate.isra.61+0x71/0x78 kernel: free_hot_cold_page+0x1c5/0x202 kernel: __put_page+0x2c/0x36 kernel: svc_rdma_put_context+0xd9/0xe4 [rpcrdma] kernel: svc_rdma_wc_send+0x50/0x98 [rpcrdma] This issue exists all the way back to v4.5, but refactoring and code re-organization prevents this simple patch from applying to kernels older than v4.12. The fix is the same, however, if someone needs to backport it. Reported-by: Ben Coddington BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=314 Fixes: 5d252f90a800 ('svcrdma: Add class for RDMA backwards ... ') Cc: stable@vger.kernel.org # v4.12 Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index ec37ad83b068..1854db227742 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -132,6 +132,10 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, if (ret) goto out_err; + /* Bump page refcnt so Send completion doesn't release + * the rq_buffer before all retransmits are complete. + */ + get_page(virt_to_page(rqst->rq_buffer)); ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); if (ret) goto out_unmap; @@ -164,7 +168,6 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - /* svc_rdma_sendto releases this page */ page = alloc_page(RPCRDMA_DEF_GFP); if (!page) return -ENOMEM; @@ -183,6 +186,7 @@ xprt_rdma_bc_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; + put_page(virt_to_page(rqst->rq_buffer)); kfree(rqst->rq_rbuffer); } -- cgit v1.2.3 From 1754eb2b27d7a58e5b9038c6297a1e7bbff4ed52 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 24 Oct 2017 14:58:11 -0400 Subject: rpc: remove some BUG()s It would be kinder to WARN() and recover in several spots here instead of BUG()ing. Also, it looks like the read_u32_from_xdr_buf() call could actually fail, though it might require a broken (or malicious) client, so convert that to just an error return. Reported-by: Weston Andros Adamson Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 7b1ee5a0b03c..73165e9ca5bf 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -855,11 +855,13 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g return stat; if (integ_len > buf->len) return stat; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) - BUG(); + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) { + WARN_ON_ONCE(1); + return stat; + } /* copy out mic... */ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) - BUG(); + return stat; if (mic.len > RPC_MAX_AUTH_SIZE) return stat; mic.data = kmalloc(mic.len, GFP_KERNEL); @@ -1611,8 +1613,10 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) BUG_ON(integ_len % 4); *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) - BUG(); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) { + WARN_ON_ONCE(1); + goto out_err; + } if (resbuf->tail[0].iov_base == NULL) { if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto out_err; -- cgit v1.2.3 From 77a08867a66796f8316449e030e0bfc84f2a3f66 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Oct 2017 10:49:51 -0400 Subject: svcrdma: Enqueue after setting XPT_CLOSE in completion handlers I noticed the server was sometimes not closing the connection after a flushed Send. For example, if the client responds with an RNR NAK to a Reply from the server, that client might be deadlocked, and thus wouldn't send any more traffic. Thus the server wouldn't have any opportunity to notice the XPT_CLOSE bit has been set. Enqueue the transport so that svcxprt notices the bit even if there is no more transport activity after a flushed completion, QP access error, or device removal event. Signed-off-by: Chuck Lever Reviewed-By: Devesh Sharma Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 5caf8e722a11..46ec069150d5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -290,6 +290,7 @@ static void qp_event_handler(struct ib_event *event, void *context) ib_event_msg(event->event), event->event, event->element.qp); set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); break; } } @@ -322,8 +323,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) goto out; - svc_xprt_enqueue(&xprt->sc_xprt); - goto out; + goto out_enqueue; flushed: if (wc->status != IB_WC_WR_FLUSH_ERR) @@ -333,6 +333,8 @@ flushed: set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); +out_enqueue: + svc_xprt_enqueue(&xprt->sc_xprt); out: svc_xprt_put(&xprt->sc_xprt); } @@ -358,6 +360,7 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) if (unlikely(wc->status != IB_WC_SUCCESS)) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("svcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), @@ -569,8 +572,10 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", xprt, cma_id); - if (xprt) + if (xprt) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); + } break; default: -- cgit v1.2.3 From 22700f3c6df55387cec2ee27c533a7b23c76dc51 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Oct 2017 17:31:43 -0400 Subject: SUNRPC: Improve ordering of transport processing Since it can take a while before a specific thread gets scheduled, it is better to just implement a first come first served queue mechanism. That way, if a thread is already scheduled and is idle, it can pick up the work to do from the queue. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 100 +++++++++++++++----------------------------------- 1 file changed, 30 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 18e87791350f..80112c45aad1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -380,7 +380,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp = NULL; int cpu; - bool queued = false; if (!svc_xprt_has_something_to_do(xprt)) goto out; @@ -401,58 +400,25 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) atomic_long_inc(&pool->sp_stats.packets); -redo_search: + dprintk("svc: transport %p put into queue\n", xprt); + spin_lock_bh(&pool->sp_lock); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; + spin_unlock_bh(&pool->sp_lock); + /* find a thread for this xprt */ rcu_read_lock(); list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* Do a lockless check first */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) continue; - - /* - * Once the xprt has been queued, it can only be dequeued by - * the task that intends to service it. All we can do at that - * point is to try to wake this thread back up so that it can - * do so. - */ - if (!queued) { - spin_lock_bh(&rqstp->rq_lock); - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) { - /* already busy, move on... */ - spin_unlock_bh(&rqstp->rq_lock); - continue; - } - - /* this one will do */ - rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); - spin_unlock_bh(&rqstp->rq_lock); - } - rcu_read_unlock(); - atomic_long_inc(&pool->sp_stats.threads_woken); wake_up_process(rqstp->rq_task); - put_cpu(); - goto out; - } - rcu_read_unlock(); - - /* - * We didn't find an idle thread to use, so we need to queue the xprt. - * Do so and then search again. If we find one, we can't hook this one - * up to it directly but we can wake the thread up in the hopes that it - * will pick it up once it searches for a xprt to service. - */ - if (!queued) { - queued = true; - dprintk("svc: transport %p put into queue\n", xprt); - spin_lock_bh(&pool->sp_lock); - list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); - pool->sp_stats.sockets_queued++; - spin_unlock_bh(&pool->sp_lock); - goto redo_search; + goto out_unlock; } + set_bit(SP_CONGESTED, &pool->sp_flags); rqstp = NULL; +out_unlock: + rcu_read_unlock(); put_cpu(); out: trace_svc_xprt_do_enqueue(xprt, rqstp); @@ -721,38 +687,25 @@ rqst_should_sleep(struct svc_rqst *rqstp) static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { - struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); - /* Normally we will wait up to 5 seconds for any required - * cache information to be provided. - */ - rqstp->rq_chandle.thread_wait = 5*HZ; - - xprt = svc_xprt_dequeue(pool); - if (xprt) { - rqstp->rq_xprt = xprt; - - /* As there is a shortage of threads and this request - * had to be queued, don't allow the thread to wait so - * long for cache updates. - */ - rqstp->rq_chandle.thread_wait = 1*HZ; - clear_bit(SP_TASK_PENDING, &pool->sp_flags); - return xprt; - } + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; /* * We have to be able to interrupt this wait * to bring down the daemons ... */ set_current_state(TASK_INTERRUPTIBLE); + smp_mb__before_atomic(); + clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); - smp_mb(); + smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) time_left = schedule_timeout(timeout); @@ -761,13 +714,11 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) try_to_freeze(); - spin_lock_bh(&rqstp->rq_lock); set_bit(RQ_BUSY, &rqstp->rq_flags); - spin_unlock_bh(&rqstp->rq_lock); - - xprt = rqstp->rq_xprt; - if (xprt != NULL) - return xprt; + smp_mb__after_atomic(); + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; if (!time_left) atomic_long_inc(&pool->sp_stats.threads_timedout); @@ -775,6 +726,15 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (signalled() || kthread_should_stop()) return ERR_PTR(-EINTR); return ERR_PTR(-EAGAIN); +out_found: + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. + */ + if (!test_bit(SP_CONGESTED, &pool->sp_flags)) + rqstp->rq_chandle.thread_wait = 5*HZ; + else + rqstp->rq_chandle.thread_wait = 1*HZ; + return rqstp->rq_xprt; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) -- cgit v1.2.3 From 80661e7687f202514ee5eea0e74916d3c50c2606 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:22 -0800 Subject: ila: cleanup checksum diff Consolidate computing checksum diff into one function. Add get_csum_diff_iaddr that computes the checksum diff between an address argument and locator being written. get_csum_diff calls this using the destination address in the IP header as the argument. Also moved ila_init_saved_csum to be close to the checksum diff functions. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_common.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index aba0998ddbfb..f1d9248d8b86 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -13,15 +13,28 @@ #include #include "ila.h" -static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +void ila_init_saved_csum(struct ila_params *p) { - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + if (!p->locator_match.v64) + return; + + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); +} +static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p) +{ if (p->locator_match.v64) return p->csum_diff; else - return compute_csum_diff8((__be32 *)&iaddr->loc, - (__be32 *)&p->locator); + return compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)&iaddr->loc); +} + +static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } static void ila_csum_do_neutral(struct ila_addr *iaddr, @@ -30,13 +43,7 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; - /* Check if checksum adjust value has been cached */ - if (p->locator_match.v64) { - diff = p->csum_diff; - } else { - diff = compute_csum_diff8((__be32 *)&p->locator, - (__be32 *)iaddr); - } + diff = get_csum_diff_iaddr(iaddr, p); fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); @@ -134,16 +141,6 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, iaddr->loc = p->locator; } -void ila_init_saved_csum(struct ila_params *p) -{ - if (!p->locator_match.v64) - return; - - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator, - (__be32 *)&p->locator_match); -} - static int __init ila_init(void) { int ret; -- cgit v1.2.3 From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:23 -0800 Subject: ila: add checksum neutral map auto Add checksum neutral auto that performs checksum neutral mapping without using the C-bit. This is enabled by configuration of a mapping. The checksum neutral function has been split into ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former handles the C-bit and includes it in the adjustment value. The latter just sets the adjustment value on the locator diff only. Added configuration for checksum neutral map aut in ila_lwt and ila_xlat. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_common.c | 65 ++++++++++++++++++++++++++++------------------- net/ipv6/ila/ila_lwt.c | 29 +++++++++++---------- net/ipv6/ila/ila_xlat.c | 10 +++++--- 3 files changed, 60 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index f1d9248d8b86..8c88ecf29b93 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -37,8 +37,8 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } -static void ila_csum_do_neutral(struct ila_addr *iaddr, - struct ila_params *p) +static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, + struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; @@ -60,13 +60,23 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, iaddr->ident.csum_neutral ^= 1; } -static void ila_csum_adjust_transport(struct sk_buff *skb, +static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + diff = get_csum_diff_iaddr(iaddr, p); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); +} + +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) +{ size_t nhoff = sizeof(struct ipv6hdr); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: @@ -105,36 +115,39 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, } break; } - - /* Now change destination address */ - iaddr->loc = p->locator; } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, - bool set_csum_neutral) + bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); - /* First deal with the transport checksum */ - if (ila_csum_neutral_set(iaddr->ident)) { - /* C-bit is set in the locator indicating that this - * is a locator being translated to a SIR address. - * Perform (receiver) checksum-neutral translation. - */ - if (!set_csum_neutral) - ila_csum_do_neutral(iaddr, p); - } else { - switch (p->csum_mode) { - case ILA_CSUM_ADJUST_TRANSPORT: - ila_csum_adjust_transport(skb, p); - break; - case ILA_CSUM_NEUTRAL_MAP: - ila_csum_do_neutral(iaddr, p); - break; - case ILA_CSUM_NO_ACTION: + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + if (sir2ila) { + if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { + /* Checksum flag should never be + * set in a formatted SIR address. + */ + break; + } + } else if (!ila_csum_neutral_set(iaddr->ident)) { + /* ILA to SIR translation and C-bit isn't + * set so we're good. + */ break; } + ila_csum_do_neutral_fmt(iaddr, p); + break; + case ILA_CSUM_NEUTRAL_MAP_AUTO: + ila_csum_do_neutral_nofmt(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; } /* Now change destination address */ diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 696281b4bca2..104af07d83a6 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -127,6 +127,7 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 csum_mode = ILA_CSUM_NO_ACTION; int ret; if (family != AF_INET6) @@ -139,15 +140,6 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } - iaddr = (struct ila_addr *)&cfg6->fc_dst; - - if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { - /* Don't allow translation for a non-ILA address or checksum - * neutral flag to be set. - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -155,6 +147,19 @@ static int ila_build_state(struct nlattr *nla, if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; + iaddr = (struct ila_addr *)&cfg6->fc_dst; + + if (tb[ILA_ATTR_CSUM_MODE]) + csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + if (csum_mode == ILA_CSUM_NEUTRAL_MAP && + ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation if checksum neutral bit is + * configured and it's set in the SIR address. + */ + return -EINVAL; + } + newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; @@ -168,17 +173,13 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); + p->csum_mode = csum_mode; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - - if (tb[ILA_ATTR_CSUM_MODE]) - p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); ila_init_saved_csum(p); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 3123b9de91b5..213259629e66 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -138,6 +138,8 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); + else + xp->ip.csum_mode = ILA_CSUM_NO_ACTION; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -198,7 +200,7 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, @@ -396,7 +398,7 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) return -1; return 0; @@ -607,7 +609,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -626,7 +628,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); + ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); -- cgit v1.2.3 From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:24 -0800 Subject: ila: allow configuration of identifier type Allow identifier to be explicitly configured for a mapping. This can either be one of the identifier types specified in the ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the identifier type is inferred from the identifier type field. If a value other than ILA_ATYPE_USE_FORMAT is set for a mapping then it is assumed that the identifier type field is not present in an identifier. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila.h | 12 +----------- net/ipv6/ila/ila_lwt.c | 51 ++++++++++++++++++++++++++++++++++++++++++------- net/ipv6/ila/ila_xlat.c | 18 ++++++++++++----- 3 files changed, 58 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index e0170f62bc39..3c7a11b62334 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -55,17 +55,6 @@ struct ila_identifier { }; }; -enum { - ILA_ATYPE_IID = 0, - ILA_ATYPE_LUID, - ILA_ATYPE_VIRT_V4, - ILA_ATYPE_VIRT_UNI_V6, - ILA_ATYPE_VIRT_MULTI_V6, - ILA_ATYPE_RSVD_1, - ILA_ATYPE_RSVD_2, - ILA_ATYPE_RSVD_3, -}; - #define CSUM_NEUTRAL_FLAG htonl(0x10000000) struct ila_addr { @@ -93,6 +82,7 @@ struct ila_params { struct ila_locator locator_match; __wsum csum_diff; u8 csum_mode; + u8 ident_type; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 104af07d83a6..4b97d573f223 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -114,6 +114,7 @@ drop: static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -127,19 +128,14 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 csum_mode = ILA_CSUM_NO_ACTION; + u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; - if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { - /* Need to have full locator and at least type field - * included in destination - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -149,6 +145,41 @@ static int ila_build_state(struct nlattr *nla, iaddr = (struct ila_addr *)&cfg6->fc_dst; + if (tb[ILA_ATTR_IDENT_TYPE]) + ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); + + if (ident_type == ILA_ATYPE_USE_FORMAT) { + /* Infer identifier type from type field in formatted + * identifier. + */ + + if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + eff_ident_type = iaddr->ident.type; + } else { + eff_ident_type = ident_type; + } + + switch (eff_ident_type) { + case ILA_ATYPE_IID: + /* Don't allow ILA for IID type */ + return -EINVAL; + case ILA_ATYPE_LUID: + break; + case ILA_ATYPE_VIRT_V4: + case ILA_ATYPE_VIRT_UNI_V6: + case ILA_ATYPE_VIRT_MULTI_V6: + case ILA_ATYPE_NONLOCAL_ADDR: + /* These ILA formats are not supported yet. */ + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -174,6 +205,7 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; + p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we @@ -208,9 +240,13 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -221,6 +257,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ 0; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 213259629e66..6eb5e68f112a 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -121,6 +121,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -141,6 +142,12 @@ static int parse_nl_config(struct genl_info *info, else xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + if (info->attrs[ILA_ATTR_IDENT_TYPE]) + xp->ip.ident_type = nla_get_u8( + info->attrs[ILA_ATTR_IDENT_TYPE]); + else + xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -398,7 +405,8 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || + nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; @@ -619,10 +627,10 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) /* Assumes skb contains a valid IPv6 header that is pulled */ - if (!ila_addr_is_ila(iaddr)) { - /* Type indicates this is not an ILA address */ - return 0; - } + /* No check here that ILA type in the mapping matches what is in the + * address. We assume that whatever sender gaves us can be translated. + * The checksum mode however is relevant. + */ rcu_read_lock(); -- cgit v1.2.3 From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:25 -0800 Subject: ila: Add a hook type for LWT routes In LWT tunnels both an input and output route method is defined. If both of these are executed in the same path then double translation happens and the effect is not correct. This patch adds a new attribute that indicates the hook type. Two values are defined for route output and route output. ILA translation is only done for the one that is set. The default is to enable ILA on route output. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_lwt.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 4b97d573f223..3d56a2fb6f86 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -20,6 +20,7 @@ struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; + u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( @@ -45,8 +46,10 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), - true); + if (ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(orig_dst->lwtstate), + true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for @@ -98,11 +101,15 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); + if (!ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(dst->lwtstate), + false); return dst->lwtstate->orig_input(skb); @@ -115,6 +122,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, + [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -129,7 +137,9 @@ static int ila_build_state(struct nlattr *nla, const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; + u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; + bool lwt_output = true; u8 eff_ident_type; int ret; @@ -180,6 +190,20 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } + if (tb[ILA_ATTR_HOOK_TYPE]) + hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); + + switch (hook_type) { + case ILA_HOOK_ROUTE_OUTPUT: + lwt_output = true; + break; + case ILA_HOOK_ROUTE_INPUT: + lwt_output = false; + break; + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -202,6 +226,8 @@ static int ila_build_state(struct nlattr *nla, return ret; } + ilwt->lwt_output = !!lwt_output; + p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; @@ -236,6 +262,7 @@ static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) @@ -247,6 +274,11 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, + ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : + ILA_HOOK_ROUTE_INPUT)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -258,6 +290,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } -- cgit v1.2.3 From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_SET: handles set and change. * TC_RED_DESTROY: handle qdisc destroy. * TC_RED_STATS: update the qdiscs counters (given as reference) * TC_RED_XSTAT: returns red xstats. Whether RED is being offloaded is being determined every time dump action is being called because parent change of this qdisc could change its offload state but doesn't require any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/sch_red.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'net') diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index fdfdb56aaae2..007dd8ef8aac 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,37 @@ static void red_reset(struct Qdisc *sch) red_restart(&q->vars); } +static int red_offload(struct Qdisc *sch, bool enable) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload opt = { + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + if (enable) { + opt.command = TC_RED_REPLACE; + opt.set.min = q->parms.qth_min >> q->parms.Wlog; + opt.set.max = q->parms.qth_max >> q->parms.Wlog; + opt.set.probability = q->parms.max_P; + opt.set.is_ecn = red_use_ecn(q); + } else { + opt.command = TC_RED_DESTROY; + } + + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); +} + static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); del_timer_sync(&q->adapt_timer); + red_offload(sch, false); qdisc_destroy(q->qdisc); } @@ -219,6 +246,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); return 0; } @@ -244,6 +272,33 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } +static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload hw_stats = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_STATS, + .stats.bstats = &sch->bstats, + .stats.qstats = &sch->qstats, + }; + int err; + + opt->flags &= ~TC_RED_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + opt->flags |= TC_RED_OFFLOADED; + + return err; +} + static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); @@ -257,8 +312,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Plog = q->parms.Plog, .Scell_log = q->parms.Scell_log, }; + int err; sch->qstats.backlog = q->qdisc->qstats.backlog; + err = red_dump_offload(sch, &opt); + if (err) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -275,6 +335,7 @@ nla_put_failure: static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); struct tc_red_xstats st = { .early = q->stats.prob_drop + q->stats.forced_drop, .pdrop = q->stats.pdrop, @@ -282,6 +343,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; + if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + struct red_stats hw_stats = {0}; + struct tc_red_qopt_offload hw_stats_request = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_XSTATS, + .xstats = &hw_stats, + }; + if (!dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_RED, + &hw_stats_request)) { + st.early += hw_stats.prob_drop + hw_stats.forced_drop; + st.pdrop += hw_stats.pdrop; + st.other += hw_stats.other; + st.marked += hw_stats.prob_mark + hw_stats.forced_mark; + } + } + return gnet_stats_copy_app(d, &st, sizeof(st)); } -- cgit v1.2.3 From 575ed7d39e2fbe602a3894bc766a8cb49af83bd3 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:42 +0100 Subject: net_sch: mqprio: Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/sch_mqprio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4d5ed45123f0..b85885a9d8a1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -50,7 +50,8 @@ static void mqprio_destroy(struct Qdisc *sch) switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, &mqprio); break; default: @@ -265,7 +266,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; } err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; -- cgit v1.2.3 From 8521db4c7e155d12fb280686c0552e47f77e9110 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:43 +0100 Subject: net_sch: cbs: Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS to match the new convention.. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index bdb533b7fb8c..7a72980c1509 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -212,7 +212,7 @@ static void cbs_disable_offload(struct net_device *dev, cbs.queue = q->queue; cbs.enable = 0; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) pr_warn("Couldn't disable CBS offload for queue %d\n", cbs.queue); @@ -236,7 +236,7 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, cbs.idleslope = opt->idleslope; cbs.sendslope = opt->sendslope; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) return err; -- cgit v1.2.3 From 92f25cafe821f6bd7f7815128bc7584a69512f68 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Mon, 6 Nov 2017 12:42:04 +0100 Subject: net: dsa: lan9303: Adjust indenting Remove scripts/checkpatch.pl CHECKs by adjusting indenting. Signed-off-by: Egil Hjelmeland Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index e526c8967b98..5ba01fc3c6ba 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -88,7 +88,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt) { u16 *lan9303_tag; unsigned int source_port; -- cgit v1.2.3 From 03ac738d5cf2f97ffa1209f6fd5d4bc211a933de Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Nov 2017 15:04:54 +0000 Subject: rtnetlink: fix missing size for IFLA_IF_NETNSID The size for IFLA_IF_NETNSID is missing from the size calculation because the proceeding semicolon was not removed. Fix this by removing the semicolon. Detected by CoverityScan, CID#1461135 ("Structurally dead code") Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface") Signed-off-by: Colin Ian King Acked-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dc5ad84ac096..dabba2a91fc8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -920,7 +920,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ - + nla_total_size(1); /* IFLA_PROTO_DOWN */ + + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + 0; } -- cgit v1.2.3 From 7f5d3f2721b07ab5896526c5992edd2ab1665561 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 7 Nov 2017 11:38:32 +0100 Subject: pktgen: document 32-bit timestamp overflow Timestamps in pktgen are currently retrieved using the deprecated do_gettimeofday() function that wraps its signed 32-bit seconds in 2038 (on 32-bit architectures) and requires a division operation to calculate microseconds. The pktgen header is also defined with the same limitations, hardcoding to a 32-bit seconds field that can be interpreted as unsigned to produce times that only wrap in 2106. Whatever code reads the timestamps should be aware of that problem in general, but probably doesn't care too much as we are mostly interested in the time passing between packets, and that is correctly represented. Using 64-bit nanoseconds would be cheaper and good for 584 years. Using monotonic times would also make this unambiguous by avoiding the overflow, but would make it harder to correlate to the times with those on remote machines. Either approach would require adding a new runtime flag and implementing the same thing on the remote side, which we probably don't want to do unless someone sees it as a real problem. Also, this should be coordinated with other pktgen implementations and might need a new magic number. For the moment, I'm documenting the overflow in the source code, and changing the implementation over to an open-coded ktime_get_real_ts64() plus division, so we don't have to look at it again while scanning for deprecated time interfaces. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/core/pktgen.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e3fa53a07d34..40db0b7e37ac 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2711,7 +2711,7 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi, static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, int datalen) { - struct timeval timestamp; + struct timespec64 timestamp; struct pktgen_hdr *pgh; pgh = skb_put(skb, sizeof(*pgh)); @@ -2773,9 +2773,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->tv_sec = 0; pgh->tv_usec = 0; } else { - do_gettimeofday(×tamp); + /* + * pgh->tv_sec wraps in y2106 when interpreted as unsigned + * as done by wireshark, or y2038 when interpreted as signed. + * This is probably harmless, but if anyone wants to improve + * it, we could introduce a variant that puts 64-bit nanoseconds + * into the respective header bytes. + * This would also be slightly faster to read. + */ + ktime_get_real_ts64(×tamp); pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC); } } -- cgit v1.2.3 From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 7 Nov 2017 21:07:02 +0800 Subject: openvswitch: enable NSH support v16->17 - Fixed disputed check code: keep them in nsh_push and nsh_pop but also add them in __ovs_nla_copy_actions v15->v16 - Add csum recalculation for nsh_push, nsh_pop and set_nsh pointed out by Pravin - Move nsh key into the union with ipv4 and ipv6 and add check for nsh key in match_validate pointed out by Pravin - Add nsh check in validate_set and __ovs_nla_copy_actions v14->v15 - Check size in nsh_hdr_from_nlattr - Fixed four small issues pointed out By Jiri and Eric v13->v14 - Rename skb_push_nsh to nsh_push per Dave's comment - Rename skb_pop_nsh to nsh_pop per Dave's comment v12->v13 - Fix NSH header length check in set_nsh v11->v12 - Fix missing changes old comments pointed out - Fix new comments for v11 v10->v11 - Fix the left three disputable comments for v9 but not fixed in v10. v9->v10 - Change struct ovs_key_nsh to struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; - Fix new comments for v9 v8->v9 - Fix build error reported by daily intel build because nsh module isn't selected by openvswitch v7->v8 - Rework nested value and mask for OVS_KEY_ATTR_NSH - Change pop_nsh to adapt to nsh kernel module - Fix many issues per comments from Jiri Benc v6->v7 - Remove NSH GSO patches in v6 because Jiri Benc reworked it as another patch series and they have been merged. - Change it to adapt to nsh kernel module added by NSH GSO patch series v5->v6 - Fix the rest comments for v4. - Add NSH GSO support for VxLAN-gpe + NSH and Eth + NSH. v4->v5 - Fix many comments by Jiri Benc and Eric Garver for v4. v3->v4 - Add new NSH match field ttl - Update NSH header to the latest format which will be final format and won't change per its author's confirmation. - Fix comments for v3. v2->v3 - Change OVS_KEY_ATTR_NSH to nested key to handle length-fixed attributes and length-variable attriubte more flexibly. - Remove struct ovs_action_push_nsh completely - Add code to handle nested attribute for SET_MASKED - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH to transfer NSH header data. - Fix comments and coding style issues by Jiri and Eric v1->v2 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh - Dynamically allocate struct ovs_action_push_nsh for length-variable metadata. OVS master and 2.8 branch has merged NSH userspace patch series, this patch is to enable NSH support in kernel data path in order that OVS can support NSH in compat mode by porting this. Signed-off-by: Yi Yang Acked-by: Jiri Benc Acked-by: Eric Garver Acked-by: Pravin Shelar Signed-off-by: David S. Miller --- net/nsh/nsh.c | 60 +++++++ net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 116 ++++++++++++++ net/openvswitch/flow.c | 51 ++++++ net/openvswitch/flow.h | 7 + net/openvswitch/flow_netlink.c | 343 ++++++++++++++++++++++++++++++++++++++++- net/openvswitch/flow_netlink.h | 5 + 7 files changed, 581 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 58fb827439a8..d7da99a0b0b8 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -14,6 +14,66 @@ #include #include +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) +{ + struct nshhdr *nh; + size_t length = nsh_hdr_len(pushed_nh); + u8 next_proto; + + if (skb->mac_len) { + next_proto = TUN_P_ETHERNET; + } else { + next_proto = tun_p_from_eth_p(skb->protocol); + if (!next_proto) + return -EAFNOSUPPORT; + } + + /* Add the NSH header */ + if (skb_cow_head(skb, length) < 0) + return -ENOMEM; + + skb_push(skb, length); + nh = (struct nshhdr *)(skb->data); + memcpy(nh, pushed_nh, length); + nh->np = next_proto; + skb_postpush_rcsum(skb, nh, length); + + skb->protocol = htons(ETH_P_NSH); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_push); + +int nsh_pop(struct sk_buff *skb) +{ + struct nshhdr *nh; + size_t length; + __be16 inner_proto; + + if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) + return -ENOMEM; + nh = (struct nshhdr *)(skb->data); + length = nsh_hdr_len(nh); + inner_proto = tun_p_to_eth_p(nh->np); + if (!pskb_may_pull(skb, length)) + return -ENOMEM; + + if (!inner_proto) + return -EAFNOSUPPORT; + + skb_pull_rcsum(skb, length); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + skb->protocol = inner_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_pop); + static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ce947292ae77..2650205cdaf9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -14,6 +14,7 @@ config OPENVSWITCH select MPLS select NET_MPLS_GSO select DST_CACHE + select NET_NSH ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a551232daf61..9a6a6d51e421 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -43,6 +43,7 @@ #include "flow.h" #include "conntrack.h" #include "vport.h" +#include "flow_netlink.h" struct deferred_action { struct sk_buff *skb; @@ -380,6 +381,38 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key, + const struct nshhdr *nh) +{ + int err; + + err = nsh_push(skb, nh); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = nsh_pop(skb); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + if (skb->protocol == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + else + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -602,6 +635,69 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + struct nshhdr *nh; + size_t length; + int err; + u8 flags; + u8 ttl; + int i; + + struct ovs_key_nsh key; + struct ovs_key_nsh mask; + + err = nsh_key_from_nlattr(a, &key, &mask); + if (err) + return err; + + /* Make sure the NSH base header is there */ + if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) + return -ENOMEM; + + nh = nsh_hdr(skb); + length = nsh_hdr_len(nh); + + /* Make sure the whole NSH header is there */ + err = skb_ensure_writable(skb, skb_network_offset(skb) + + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + skb_postpull_rcsum(skb, nh, length); + flags = nsh_get_flags(nh); + flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); + flow_key->nsh.base.flags = flags; + ttl = nsh_get_ttl(nh); + ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); + flow_key->nsh.base.ttl = ttl; + nsh_set_flags_and_ttl(nh, flags, ttl); + nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, + mask.base.path_hdr); + flow_key->nsh.base.path_hdr = nh->path_hdr; + switch (nh->mdtype) { + case NSH_M_TYPE1: + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { + nh->md1.context[i] = + OVS_MASKED(nh->md1.context[i], key.context[i], + mask.context[i]); + } + memcpy(flow_key->nsh.context, nh->md1.context, + sizeof(nh->md1.context)); + break; + case NSH_M_TYPE2: + memset(flow_key->nsh.context, 0, + sizeof(flow_key->nsh.context)); + break; + default: + return -EINVAL; + } + skb_postpush_rcsum(skb, nh, length); + return 0; +} + /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1024,6 +1120,10 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; + case OVS_KEY_ATTR_NSH: + err = set_nsh(skb, flow_key, a); + break; + case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1214,6 +1314,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_ETH: err = pop_eth(skb, key); break; + + case OVS_ACTION_ATTR_PUSH_NSH: { + u8 buffer[NSH_HDR_MAX_LEN]; + struct nshhdr *nh = (struct nshhdr *)buffer; + + err = nsh_hdr_from_nlattr(nla_data(a), nh, + NSH_HDR_MAX_LEN); + if (unlikely(err)) + break; + err = push_nsh(skb, key, nh); + break; + } + + case OVS_ACTION_ATTR_POP_NSH: + err = pop_nsh(skb, key); + break; } if (unlikely(err)) { diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8c94cef25a72..864ddb1e3642 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "conntrack.h" #include "datapath.h" @@ -490,6 +491,52 @@ invalid: return 0; } +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct nshhdr *nh; + unsigned int nh_ofs = skb_network_offset(skb); + u8 version, length; + int err; + + err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + version = nsh_get_ver(nh); + length = nsh_hdr_len(nh); + + if (version != 0) + return -EINVAL; + + err = check_header(skb, nh_ofs + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + key->nsh.base.flags = nsh_get_flags(nh); + key->nsh.base.ttl = nsh_get_ttl(nh); + key->nsh.base.mdtype = nh->mdtype; + key->nsh.base.np = nh->np; + key->nsh.base.path_hdr = nh->path_hdr; + switch (key->nsh.base.mdtype) { + case NSH_M_TYPE1: + if (length != NSH_M_TYPE1_LEN) + return -EINVAL; + memcpy(key->nsh.context, nh->md1.context, + sizeof(nh->md1)); + break; + case NSH_M_TYPE2: + memset(key->nsh.context, 0, + sizeof(nh->md1)); + break; + default: + return -EINVAL; + } + + return 0; +} + /** * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the @@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->tp, 0, sizeof(key->tp)); } } + } else if (key->eth.type == htons(ETH_P_NSH)) { + error = parse_nsh(skb, key); + if (error) + return error; } return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1875bba4f865..c670dd24b8b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -35,6 +35,7 @@ #include #include #include +#include struct sk_buff; @@ -66,6 +67,11 @@ struct vlan_head { (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) +struct ovs_key_nsh { + struct ovs_nsh_key_base base; + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + struct sw_flow_key { u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; @@ -143,6 +149,7 @@ struct sw_flow_key { } nd; }; } ipv6; + struct ovs_key_nsh nsh; /* network service header */ }; struct { /* Connection tracking fields not packed above. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dc0d79092e74..4201f9293af3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "flow_netlink.h" @@ -80,9 +81,11 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_NSH: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_NSH: case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: @@ -175,7 +178,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) | (1 << OVS_KEY_ATTR_ND) - | (1 << OVS_KEY_ATTR_MPLS)); + | (1 << OVS_KEY_ATTR_MPLS) + | (1 << OVS_KEY_ATTR_NSH)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -284,6 +288,14 @@ static bool match_validate(const struct sw_flow_match *match, } } + if (match->key->eth.type == htons(ETH_P_NSH)) { + key_expected |= 1 << OVS_KEY_ATTR_NSH; + if (match->mask && + match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_NSH; + } + } + if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", @@ -325,12 +337,25 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } +size_t ovs_nsh_key_attr_size(void) +{ + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ + /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are + * mutually exclusive, so the bigger one can cover + * the small one. + */ + + nla_total_size(NSH_CTX_HDRS_MAX_LEN); +} + size_t ovs_key_attr_size(void) { /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -344,6 +369,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ + + ovs_nsh_key_attr_size() + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -377,6 +404,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; +static const struct ovs_len_tbl +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { + [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, + [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, + [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, +}; + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, @@ -409,6 +443,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, + [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, + .next = ovs_nsh_key_attr_lens, }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -1227,6 +1263,221 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +int nsh_hdr_from_nlattr(const struct nlattr *attr, + struct nshhdr *nh, size_t size) +{ + struct nlattr *a; + int rem; + u8 flags = 0; + u8 ttl = 0; + int mdlen = 0; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + if (size < NSH_BASE_HDR_LEN) + return -ENOBUFS; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + flags = base->flags; + ttl = base->ttl; + nh->np = base->np; + nh->mdtype = base->mdtype; + nh->path_hdr = base->path_hdr; + break; + } + case OVS_NSH_KEY_ATTR_MD1: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md1, nla_data(a), mdlen); + break; + + case OVS_NSH_KEY_ATTR_MD2: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md2, nla_data(a), mdlen); + break; + + default: + return -EINVAL; + } + } + + /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ + nh->ver_flags_ttl_len = 0; + nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); + + return 0; +} + +int nsh_key_from_nlattr(const struct nlattr *attr, + struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) +{ + struct nlattr *a; + int rem; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + const struct ovs_nsh_key_base *base_mask = base + 1; + + nsh->base = *base; + nsh_mask->base = *base_mask; + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; + + memcpy(nsh->context, md1->context, sizeof(*md1)); + memcpy(nsh_mask->context, md1_mask->context, + sizeof(*md1_mask)); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + /* Not supported yet */ + return -ENOTSUPP; + default: + return -EINVAL; + } + } + + return 0; +} + +static int nsh_key_put_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool is_push_nsh, bool log) +{ + struct nlattr *a; + int rem; + bool has_base = false; + bool has_md1 = false; + bool has_md2 = false; + u8 mdtype = 0; + int mdlen = 0; + + if (WARN_ON(is_push_nsh && is_mask)) + return -EINVAL; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int i; + + if (type > OVS_NSH_KEY_ATTR_MAX) { + OVS_NLERR(log, "nsh attr %d is out of range max %d", + type, OVS_NSH_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_nsh_key_attr_lens[type].len)) { + OVS_NLERR( + log, + "nsh attr %d has unexpected len %d expected %d", + type, + nla_len(a), + ovs_nsh_key_attr_lens[type].len + ); + return -EINVAL; + } + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + has_base = true; + mdtype = base->mdtype; + SW_FLOW_KEY_PUT(match, nsh.base.flags, + base->flags, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.ttl, + base->ttl, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.mdtype, + base->mdtype, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.np, + base->np, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, + base->path_hdr, is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + + has_md1 = true; + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) + SW_FLOW_KEY_PUT(match, nsh.context[i], + md1->context[i], is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + if (!is_push_nsh) /* Not supported MD type 2 yet */ + return -ENOTSUPP; + + has_md2 = true; + mdlen = nla_len(a); + if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { + OVS_NLERR( + log, + "Invalid MD length %d for MD type %d", + mdlen, + mdtype + ); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown nsh attribute %d", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); + return -EINVAL; + } + + if (has_md1 && has_md2) { + OVS_NLERR( + 1, + "invalid nsh attribute: md1 and md2 are exclusive." + ); + return -EINVAL; + } + + if (!is_mask) { + if ((has_md1 && mdtype != NSH_M_TYPE1) || + (has_md2 && mdtype != NSH_M_TYPE2)) { + OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", + mdtype); + return -EINVAL; + } + + if (is_push_nsh && + (!has_base || (!has_md1 && !has_md2))) { + OVS_NLERR( + 1, + "push_nsh: missing base or metadata attributes" + ); + return -EINVAL; + } + } + + return 0; +} + static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask, bool log) @@ -1354,6 +1605,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_NSH)) { + if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, + is_mask, false, log) < 0) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_NSH); + } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; @@ -1670,6 +1928,34 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, return 0; } +static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_KEY_ATTR_NSH); + if (!start) + return -EMSGSIZE; + + if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) + goto nla_put_failure; + + if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { + if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, + sizeof(nsh->context), nsh->context)) + goto nla_put_failure; + } + + /* Don't support MD type 2 yet */ + + nla_nest_end(skb, start); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) @@ -1798,6 +2084,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_NSH)) { + if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) + goto nla_put_failure; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -2292,6 +2581,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } +static bool validate_nsh(const struct nlattr *attr, bool is_mask, + bool is_push_nsh, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + int ret = 0; + + ovs_match_init(&match, &key, true, NULL); + ret = nsh_key_put_from_nlattr(attr, &match, is_mask, + is_push_nsh, log); + return !ret; +} + /* Return false if there are any non-masked bits set. * Mask follows data immediately, before any netlink padding. */ @@ -2434,6 +2736,13 @@ static int validate_set(const struct nlattr *a, break; + case OVS_KEY_ATTR_NSH: + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + if (!validate_nsh(nla_data(a), masked, false, log)) + return -EINVAL; + break; + default: return -EINVAL; } @@ -2533,6 +2842,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, + [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, + [OVS_ACTION_ATTR_POP_NSH] = 0, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2690,6 +3001,34 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, mac_proto = MAC_PROTO_ETHERNET; break; + case OVS_ACTION_ATTR_PUSH_NSH: + if (mac_proto != MAC_PROTO_ETHERNET) { + u8 next_proto; + + next_proto = tun_p_from_eth_p(eth_type); + if (!next_proto) + return -EINVAL; + } + mac_proto = MAC_PROTO_NONE; + if (!validate_nsh(nla_data(a), false, true, true)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_POP_NSH: { + __be16 inner_proto; + + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + inner_proto = tun_p_to_eth_p(key->nsh.base.np); + if (!inner_proto) + return -EINVAL; + if (key->nsh.base.np == TUN_P_ETHERNET) + mac_proto = MAC_PROTO_ETHERNET; + else + mac_proto = MAC_PROTO_NONE; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 929c665ac3aa..6657606b2b47 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); +int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, + struct ovs_key_nsh *nsh_mask); +int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, + size_t size); + #endif /* flow_netlink.h */ -- cgit v1.2.3 From 24a9332a58b7f41a0d36c35a2c6897242bffdbc0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:43 -0500 Subject: net: dsa: constify cpu_dp member of dsa_port A DSA port has a dedicated CPU port assigned to it, stored in the cpu_dp member. It is not meant to be modified by a port, thus make it const. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 814ced75a0cc..cc7fe47dd4bf 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1147,7 +1147,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) int dsa_slave_create(struct dsa_port *port) { - struct dsa_port *cpu_dp = port->cpu_dp; + const struct dsa_port *cpu_dp = port->cpu_dp; struct net_device *master = cpu_dp->master; struct dsa_switch *ds = port->ds; const char *name = port->name; -- cgit v1.2.3 From f070464cf000131928b2c3fd592efd1946610eea Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:44 -0500 Subject: net: dsa: setup and teardown default CPU port The dsa_dst_parse function called just before dsa_dst_apply does not parse the tree but does only one thing: it assigns the default CPU port to dst->cpu_dp and to each user ports. This patch simplifies this by calling a dsa_tree_setup_default_cpu function at the beginning of dsa_dst_apply directly. A dsa_port_is_user helper is added for convenience. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 153 +++++++++++++++++++++++++-------------------------------- 1 file changed, 68 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 283104e5ca6a..0a63a2119cd0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -112,6 +112,11 @@ static bool dsa_port_is_cpu(struct dsa_port *port) return port->type == DSA_PORT_TYPE_CPU; } +static bool dsa_port_is_user(struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_USER; +} + static bool dsa_ds_find_port_dn(struct dsa_switch *ds, struct device_node *port) { @@ -218,6 +223,64 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst) return 0; } +static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; + + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + if (dsa_port_is_cpu(dp)) + return dp; + } + } + + return NULL; +} + +static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + struct dsa_port *dp; + int device, port; + + /* DSA currently only supports a single CPU port */ + dst->cpu_dp = dsa_tree_find_first_cpu(dst); + if (!dst->cpu_dp) { + pr_warn("Tree has no master device\n"); + return -EINVAL; + } + + /* Assign the default CPU port to all ports of the fabric */ + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + if (dsa_port_is_user(dp)) + dp->cpu_dp = dst->cpu_dp; + } + } + + return 0; +} + +static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) +{ + /* DSA currently only supports a single CPU port */ + dst->cpu_dp = NULL; +} + static int dsa_dsa_port_apply(struct dsa_port *port) { struct dsa_switch *ds = port->ds; @@ -412,6 +475,10 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) u32 index; int err; + err = dsa_tree_setup_default_cpu(dst); + if (err) + return err; + for (index = 0; index < DSA_MAX_SWITCHES; index++) { ds = dst->ds[index]; if (!ds) @@ -464,7 +531,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dst->cpu_dp = NULL; + dsa_tree_teardown_default_cpu(dst); pr_info("DSA: tree %d unapplied\n", dst->index); dst->applied = false; @@ -532,86 +599,6 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) return 0; } -static int dsa_cpu_parse(struct dsa_port *port, u32 index, - struct dsa_switch_tree *dst, - struct dsa_switch *ds) -{ - if (!dst->cpu_dp) - dst->cpu_dp = port; - - return 0; -} - -static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) -{ - struct dsa_port *port; - u32 index; - int err; - - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port) || - dsa_port_is_dsa(port)) - continue; - - if (dsa_port_is_cpu(port)) { - err = dsa_cpu_parse(port, index, dst, ds); - if (err) - return err; - } - - } - - pr_info("DSA: switch %d %d parsed\n", dst->index, ds->index); - - return 0; -} - -static int dsa_dst_parse(struct dsa_switch_tree *dst) -{ - struct dsa_switch *ds; - struct dsa_port *dp; - u32 index; - int port; - int err; - - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - err = dsa_ds_parse(dst, ds); - if (err) - return err; - } - - if (!dst->cpu_dp) { - pr_warn("Tree has no master device\n"); - return -EINVAL; - } - - /* Assign the default CPU port to all ports of the fabric */ - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - for (port = 0; port < ds->num_ports; port++) { - dp = &ds->ports[port]; - if (!dsa_port_is_valid(dp) || - dsa_port_is_dsa(dp) || - dsa_port_is_cpu(dp)) - continue; - - dp->cpu_dp = dst->cpu_dp; - } - } - - pr_info("DSA: tree %d parsed\n", dst->index); - - return 0; -} - static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) { struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); @@ -810,10 +797,6 @@ static int _dsa_register_switch(struct dsa_switch *ds) return -EINVAL; } - err = dsa_dst_parse(dst); - if (err) - goto out_del_dst; - err = dsa_dst_apply(dst); if (err) { dsa_dst_unapply(dst); -- cgit v1.2.3 From 17a22fcfc84a422d98a0f54e67d4ee8ee3875849 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:45 -0500 Subject: net: dsa: setup and teardown master device Add DSA helpers to setup and teardown a master net device wired to its CPU port. This centralizes the dsa_ptr assignment. This also makes the master ethtool helpers static at the same time. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 36 +++++++++++++++++++----------------- net/dsa/dsa_priv.h | 4 ++-- net/dsa/legacy.c | 20 ++------------------ net/dsa/master.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 51 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 0a63a2119cd0..c9b50339fcac 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -469,6 +469,23 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) } +static int dsa_tree_setup_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp = dst->cpu_dp; + struct net_device *master = cpu_dp->master; + + /* DSA currently supports a single pair of CPU port and master device */ + return dsa_master_setup(master, cpu_dp); +} + +static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp = dst->cpu_dp; + struct net_device *master = cpu_dp->master; + + return dsa_master_teardown(master); +} + static int dsa_dst_apply(struct dsa_switch_tree *dst) { struct dsa_switch *ds; @@ -489,14 +506,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - /* If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. - */ - wmb(); - dst->cpu_dp->master->dsa_ptr = dst->cpu_dp; - - err = dsa_master_ethtool_setup(dst->cpu_dp->master); + err = dsa_tree_setup_master(dst); if (err) return err; @@ -513,15 +523,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_master_ethtool_restore(dst->cpu_dp->master); - - dst->cpu_dp->master->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); + dsa_tree_teardown_master(dst); for (index = 0; index < DSA_MAX_SWITCHES; index++) { ds = dst->ds[index]; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 253a613c40cd..bb0218c1b570 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -108,8 +108,8 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid); /* master.c */ -int dsa_master_ethtool_setup(struct net_device *dev); -void dsa_master_ethtool_restore(struct net_device *dev); +int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); +void dsa_master_teardown(struct net_device *dev); static inline struct net_device *dsa_master_find_slave(struct net_device *dev, int device, int port) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 0511fe2feff7..4863e3e398b6 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -593,15 +593,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, if (!configured) return -EPROBE_DEFER; - /* - * If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. - */ - wmb(); - dev->dsa_ptr = dst->cpu_dp; - - return dsa_master_ethtool_setup(dst->cpu_dp->master); + return dsa_master_setup(dst->cpu_dp->master, dst->cpu_dp); } static int dsa_probe(struct platform_device *pdev) @@ -666,15 +658,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_master_ethtool_restore(dst->cpu_dp->master); - - dst->cpu_dp->master->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); + dsa_master_teardown(dst->cpu_dp->master); for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; diff --git a/net/dsa/master.c b/net/dsa/master.c index 5f3f57e372e0..00589147f042 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -85,7 +85,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, } } -int dsa_master_ethtool_setup(struct net_device *dev) +static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; @@ -108,10 +108,36 @@ int dsa_master_ethtool_setup(struct net_device *dev) return 0; } -void dsa_master_ethtool_restore(struct net_device *dev) +static void dsa_master_ethtool_teardown(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; } + +int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) +{ + /* If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + + dev->dsa_ptr = cpu_dp; + + return dsa_master_ethtool_setup(dev); +} + +void dsa_master_teardown(struct net_device *dev) +{ + dsa_master_ethtool_teardown(dev); + + dev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); +} -- cgit v1.2.3 From ec15dd4269d0cbf947c9a2dfdcf08a917098fab1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:46 -0500 Subject: net: dsa: setup and teardown tree This commit provides better scope for the DSA tree setup and teardown functions. It renames the "applied" bool to "setup" and print a message when the tree is setup, as it is done during teardown. At the same time, check dst->setup in dsa_tree_setup, where it is set to true. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c9b50339fcac..1a8df0a177b5 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -486,12 +486,18 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) return dsa_master_teardown(master); } -static int dsa_dst_apply(struct dsa_switch_tree *dst) +static int dsa_tree_setup(struct dsa_switch_tree *dst) { struct dsa_switch *ds; u32 index; int err; + if (dst->setup) { + pr_err("DSA: tree %d already setup! Disjoint trees?\n", + dst->index); + return -EEXIST; + } + err = dsa_tree_setup_default_cpu(dst); if (err) return err; @@ -510,17 +516,19 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) if (err) return err; - dst->applied = true; + dst->setup = true; + + pr_info("DSA: tree %d setup\n", dst->index); return 0; } -static void dsa_dst_unapply(struct dsa_switch_tree *dst) +static void dsa_tree_teardown(struct dsa_switch_tree *dst) { struct dsa_switch *ds; u32 index; - if (!dst->applied) + if (!dst->setup) return; dsa_tree_teardown_master(dst); @@ -535,8 +543,9 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); - pr_info("DSA: tree %d unapplied\n", dst->index); - dst->applied = false; + pr_info("DSA: tree %d torn down\n", dst->index); + + dst->setup = false; } static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, @@ -794,14 +803,9 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err == 1) return 0; - if (dst->applied) { - pr_info("DSA: Disjoint trees?\n"); - return -EINVAL; - } - - err = dsa_dst_apply(dst); + err = dsa_tree_setup(dst); if (err) { - dsa_dst_unapply(dst); + dsa_tree_teardown(dst); goto out_del_dst; } @@ -852,7 +856,7 @@ static void _dsa_unregister_switch(struct dsa_switch *ds) struct dsa_switch_tree *dst = ds->dst; unsigned int index = ds->index; - dsa_dst_unapply(dst); + dsa_tree_teardown(dst); dsa_tree_remove_switch(dst, index); } -- cgit v1.2.3 From 1f08f9e9cbc63d92c246f40ae4221cba86ef8ec6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:47 -0500 Subject: net: dsa: setup and teardown switches This patches brings no functional changes. It removes the unused dst argument from the dsa_ds_apply and dsa_ds_unapply functions, rename them to dsa_switch_setup and dsa_switch_teardown for a more explicit scope. This clarifies the steps of the setup or teardown of a switch fabric. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 62 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 1a8df0a177b5..2b3b2a86791d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -362,7 +362,7 @@ static void dsa_user_port_unapply(struct dsa_port *port) } } -static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_port *port; u32 index; @@ -433,7 +433,7 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) return 0; } -static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static void dsa_switch_teardown(struct dsa_switch *ds) { struct dsa_port *port; u32 index; @@ -469,6 +469,39 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) } +static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + int device; + int err; + + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; + + err = dsa_switch_setup(ds); + if (err) + return err; + } + + return 0; +} + +static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + int device; + + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; + if (!ds) + continue; + + dsa_switch_teardown(ds); + } +} + static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { struct dsa_port *cpu_dp = dst->cpu_dp; @@ -488,8 +521,6 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) static int dsa_tree_setup(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - u32 index; int err; if (dst->setup) { @@ -502,15 +533,9 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) if (err) return err; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - err = dsa_ds_apply(dst, ds); - if (err) - return err; - } + err = dsa_tree_setup_switches(dst); + if (err) + return err; err = dsa_tree_setup_master(dst); if (err) @@ -525,21 +550,12 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) static void dsa_tree_teardown(struct dsa_switch_tree *dst) { - struct dsa_switch *ds; - u32 index; - if (!dst->setup) return; dsa_tree_teardown_master(dst); - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; - if (!ds) - continue; - - dsa_ds_unapply(dst, ds); - } + dsa_tree_teardown_switches(dst); dsa_tree_teardown_default_cpu(dst); -- cgit v1.2.3 From 1d27732f411d57f0168af30be2adb504b8b7749d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:48 -0500 Subject: net: dsa: setup and teardown ports The dsa_dsa_port_apply and dsa_cpu_port_apply functions do exactly the same. The dsa_user_port_apply function does not try to register a fixed link but try to create a slave. This commit factorizes and scopes all that in two convenient dsa_port_setup and dsa_port_teardown functions. It won't hurt to register a devlink_port for unused port as well. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 173 ++++++++++++++++++++------------------------------------- 1 file changed, 59 insertions(+), 114 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 2b3b2a86791d..676c0bc943dd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -281,91 +281,65 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) dst->cpu_dp = NULL; } -static int dsa_dsa_port_apply(struct dsa_port *port) +static int dsa_port_setup(struct dsa_port *dp) { - struct dsa_switch *ds = port->ds; + struct dsa_switch *ds = dp->ds; int err; - err = dsa_port_fixed_link_register_of(port); - if (err) { - dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", - port->index, err); - return err; - } - - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - - return devlink_port_register(ds->devlink, &port->devlink_port, - port->index); -} - -static void dsa_dsa_port_unapply(struct dsa_port *port) -{ - devlink_port_unregister(&port->devlink_port); - dsa_port_fixed_link_unregister_of(port); -} - -static int dsa_cpu_port_apply(struct dsa_port *port) -{ - struct dsa_switch *ds = port->ds; - int err; + memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); - err = dsa_port_fixed_link_register_of(port); - if (err) { - dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", - port->index, err); + err = devlink_port_register(ds->devlink, &dp->devlink_port, dp->index); + if (err) return err; - } - - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - err = devlink_port_register(ds->devlink, &port->devlink_port, - port->index); - return err; -} - -static void dsa_cpu_port_unapply(struct dsa_port *port) -{ - devlink_port_unregister(&port->devlink_port); - dsa_port_fixed_link_unregister_of(port); -} -static int dsa_user_port_apply(struct dsa_port *port) -{ - struct dsa_switch *ds = port->ds; - int err; + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + break; + case DSA_PORT_TYPE_CPU: + case DSA_PORT_TYPE_DSA: + err = dsa_port_fixed_link_register_of(dp); + if (err) { + dev_err(ds->dev, "failed to register fixed link for port %d.%d\n", + ds->index, dp->index); + return err; + } - err = dsa_slave_create(port); - if (err) { - dev_warn(ds->dev, "Failed to create slave %d: %d\n", - port->index, err); - port->slave = NULL; - return err; + break; + case DSA_PORT_TYPE_USER: + err = dsa_slave_create(dp); + if (err) + dev_err(ds->dev, "failed to create slave for port %d.%d\n", + ds->index, dp->index); + else + devlink_port_type_eth_set(&dp->devlink_port, dp->slave); + break; } - memset(&port->devlink_port, 0, sizeof(port->devlink_port)); - err = devlink_port_register(ds->devlink, &port->devlink_port, - port->index); - if (err) - return err; - - devlink_port_type_eth_set(&port->devlink_port, port->slave); - return 0; } -static void dsa_user_port_unapply(struct dsa_port *port) +static void dsa_port_teardown(struct dsa_port *dp) { - devlink_port_unregister(&port->devlink_port); - if (port->slave) { - dsa_slave_destroy(port->slave); - port->slave = NULL; + devlink_port_unregister(&dp->devlink_port); + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + break; + case DSA_PORT_TYPE_CPU: + case DSA_PORT_TYPE_DSA: + dsa_port_fixed_link_unregister_of(dp); + break; + case DSA_PORT_TYPE_USER: + if (dp->slave) { + dsa_slave_destroy(dp->slave); + dp->slave = NULL; + } + break; } } static int dsa_switch_setup(struct dsa_switch *ds) { - struct dsa_port *port; - u32 index; int err; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus @@ -406,56 +380,11 @@ static int dsa_switch_setup(struct dsa_switch *ds) return err; } - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) - continue; - - if (dsa_port_is_dsa(port)) { - err = dsa_dsa_port_apply(port); - if (err) - return err; - continue; - } - - if (dsa_port_is_cpu(port)) { - err = dsa_cpu_port_apply(port); - if (err) - return err; - continue; - } - - err = dsa_user_port_apply(port); - if (err) - continue; - } - return 0; } static void dsa_switch_teardown(struct dsa_switch *ds) { - struct dsa_port *port; - u32 index; - - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) - continue; - - if (dsa_port_is_dsa(port)) { - dsa_dsa_port_unapply(port); - continue; - } - - if (dsa_port_is_cpu(port)) { - dsa_cpu_port_unapply(port); - continue; - } - - dsa_user_port_unapply(port); - } - if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); @@ -472,7 +401,8 @@ static void dsa_switch_teardown(struct dsa_switch *ds) static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { struct dsa_switch *ds; - int device; + struct dsa_port *dp; + int device, port; int err; for (device = 0; device < DSA_MAX_SWITCHES; device++) { @@ -483,6 +413,14 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) err = dsa_switch_setup(ds); if (err) return err; + + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + err = dsa_port_setup(dp); + if (err) + return err; + } } return 0; @@ -491,13 +429,20 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) { struct dsa_switch *ds; - int device; + struct dsa_port *dp; + int device, port; for (device = 0; device < DSA_MAX_SWITCHES; device++) { ds = dst->ds[device]; if (!ds) continue; + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + dsa_port_teardown(dp); + } + dsa_switch_teardown(ds); } } -- cgit v1.2.3 From f163da8853aa9d8060157a96ed314299b87ba070 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:49 -0500 Subject: net: dsa: add find port by node helper Instead of having two dsa_ds_find_port_dn (which returns a bool) and dsa_dst_find_port_dn (which returns a switch) functions, provide a more explicit dsa_tree_find_port_by_node function which returns a matching port. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 676c0bc943dd..0f6f8c1701f9 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -117,30 +117,24 @@ static bool dsa_port_is_user(struct dsa_port *dp) return dp->type == DSA_PORT_TYPE_USER; } -static bool dsa_ds_find_port_dn(struct dsa_switch *ds, - struct device_node *port) -{ - u32 index; - - for (index = 0; index < ds->num_ports; index++) - if (ds->ports[index].dn == port) - return true; - return false; -} - -static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst, - struct device_node *port) +static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, + struct device_node *dn) { struct dsa_switch *ds; - u32 index; + struct dsa_port *dp; + int device, port; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; if (!ds) continue; - if (dsa_ds_find_port_dn(ds, port)) - return ds; + for (port = 0; port < ds->num_ports; port++) { + dp = &ds->ports[port]; + + if (dp->dn == dn) + return dp; + } } return NULL; @@ -154,18 +148,21 @@ static int dsa_port_complete(struct dsa_switch_tree *dst, struct device_node *link; int index; struct dsa_switch *dst_ds; + struct dsa_port *link_dp; for (index = 0;; index++) { link = of_parse_phandle(port->dn, "link", index); if (!link) break; - dst_ds = dsa_dst_find_port_dn(dst, link); + link_dp = dsa_tree_find_port_by_node(dst, link); of_node_put(link); - if (!dst_ds) + if (!link_dp) return 1; + dst_ds = link_dp->ds; + src_ds->rtable[dst_ds->index] = src_port; } -- cgit v1.2.3 From c52866655558e5fc87ceae8aac528a7e410c8a77 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:50 -0500 Subject: net: dsa: use of_for_each_phandle The OF code provides a of_for_each_phandle() helper to iterate over phandles. Use it instead of arbitrary iterating ourselves over the list of phandles hanging to the "link" property of the port's device node. The of_phandle_iterator_next() helper calls of_node_put() itself on it.node. Thus We must only do it ourselves if we break the loop. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 0f6f8c1701f9..25ed41262ead 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -145,21 +145,18 @@ static int dsa_port_complete(struct dsa_switch_tree *dst, struct dsa_port *port, u32 src_port) { - struct device_node *link; - int index; + struct device_node *dn = port->dn; + struct of_phandle_iterator it; struct dsa_switch *dst_ds; struct dsa_port *link_dp; + int err; - for (index = 0;; index++) { - link = of_parse_phandle(port->dn, "link", index); - if (!link) - break; - - link_dp = dsa_tree_find_port_by_node(dst, link); - of_node_put(link); - - if (!link_dp) + of_for_each_phandle(&it, err, dn, "link", NULL, 0) { + link_dp = dsa_tree_find_port_by_node(dst, it.node); + if (!link_dp) { + of_node_put(it.node); return 1; + } dst_ds = link_dp->ds; -- cgit v1.2.3 From 34c09a8916fb52aac948dfc861b33c0b3b37ac29 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:51 -0500 Subject: net: dsa: setup routing table The *_complete() functions take too much arguments to do only one thing: they try to fetch the dsa_port structures corresponding to device nodes under the "link" list property of DSA ports, and use them to setup the routing table of switches. This patch simplifies them by providing instead simpler dsa_{port,switch,tree}_setup_routing_table functions which return a boolean value, true if the tree is complete. dsa_tree_setup_routing_table is called inside dsa_tree_setup which simplifies the switch registering function as well. A switch's routing table is now initialized before its setup. This also makes dsa_port_is_valid obsolete, remove it. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 108 ++++++++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 25ed41262ead..44d26b5977cd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -94,14 +94,6 @@ static void dsa_tree_put(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_tree_release); } -/* For platform data configurations, we need to have a valid name argument to - * differentiate a disabled port from an enabled one - */ -static bool dsa_port_is_valid(struct dsa_port *port) -{ - return port->type != DSA_PORT_TYPE_UNUSED; -} - static bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; @@ -140,14 +132,12 @@ static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, return NULL; } -static int dsa_port_complete(struct dsa_switch_tree *dst, - struct dsa_switch *src_ds, - struct dsa_port *port, - u32 src_port) +static bool dsa_port_setup_routing_table(struct dsa_port *dp) { - struct device_node *dn = port->dn; + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; + struct device_node *dn = dp->dn; struct of_phandle_iterator it; - struct dsa_switch *dst_ds; struct dsa_port *link_dp; int err; @@ -155,66 +145,54 @@ static int dsa_port_complete(struct dsa_switch_tree *dst, link_dp = dsa_tree_find_port_by_node(dst, it.node); if (!link_dp) { of_node_put(it.node); - return 1; + return false; } - dst_ds = link_dp->ds; - - src_ds->rtable[dst_ds->index] = src_port; + ds->rtable[link_dp->ds->index] = dp->index; } - return 0; + return true; } -/* A switch is complete if all the DSA ports phandles point to ports - * known in the tree. A return value of 1 means the tree is not - * complete. This is not an error condition. A value of 0 is - * success. - */ -static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) +static bool dsa_switch_setup_routing_table(struct dsa_switch *ds) { - struct dsa_port *port; - u32 index; - int err; + bool complete = true; + struct dsa_port *dp; + int i; - for (index = 0; index < ds->num_ports; index++) { - port = &ds->ports[index]; - if (!dsa_port_is_valid(port)) - continue; + for (i = 0; i < DSA_MAX_SWITCHES; i++) + ds->rtable[i] = DSA_RTABLE_NONE; - if (!dsa_port_is_dsa(port)) - continue; + for (i = 0; i < ds->num_ports; i++) { + dp = &ds->ports[i]; - err = dsa_port_complete(dst, ds, port, index); - if (err != 0) - return err; + if (dsa_port_is_dsa(dp)) { + complete = dsa_port_setup_routing_table(dp); + if (!complete) + break; + } } - return 0; + return complete; } -/* A tree is complete if all the DSA ports phandles point to ports - * known in the tree. A return value of 1 means the tree is not - * complete. This is not an error condition. A value of 0 is - * success. - */ -static int dsa_dst_complete(struct dsa_switch_tree *dst) +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) { struct dsa_switch *ds; - u32 index; - int err; + bool complete = true; + int device; - for (index = 0; index < DSA_MAX_SWITCHES; index++) { - ds = dst->ds[index]; + for (device = 0; device < DSA_MAX_SWITCHES; device++) { + ds = dst->ds[device]; if (!ds) continue; - err = dsa_ds_complete(dst, ds); - if (err != 0) - return err; + complete = dsa_switch_setup_routing_table(ds); + if (!complete) + break; } - return 0; + return complete; } static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) @@ -460,6 +438,7 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) static int dsa_tree_setup(struct dsa_switch_tree *dst) { + bool complete; int err; if (dst->setup) { @@ -468,6 +447,10 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) return -EEXIST; } + complete = dsa_tree_setup_routing_table(dst); + if (!complete) + return 0; + err = dsa_tree_setup_default_cpu(dst); if (err) return err; @@ -727,7 +710,7 @@ static int _dsa_register_switch(struct dsa_switch *ds) struct device_node *np = ds->dev->of_node; struct dsa_switch_tree *dst; unsigned int index; - int i, err; + int err; if (np) err = dsa_switch_parse_of(ds, np); @@ -742,33 +725,16 @@ static int _dsa_register_switch(struct dsa_switch *ds) index = ds->index; dst = ds->dst; - /* Initialize the routing table */ - for (i = 0; i < DSA_MAX_SWITCHES; ++i) - ds->rtable[i] = DSA_RTABLE_NONE; - err = dsa_tree_add_switch(dst, ds); if (err) return err; - err = dsa_dst_complete(dst); - if (err < 0) - goto out_del_dst; - - /* Not all switches registered yet */ - if (err == 1) - return 0; - err = dsa_tree_setup(dst); if (err) { dsa_tree_teardown(dst); - goto out_del_dst; + dsa_tree_remove_switch(dst, index); } - return 0; - -out_del_dst: - dsa_tree_remove_switch(dst, index); - return err; } -- cgit v1.2.3 From 308173546ac4342103541e8d4e4ce83d1a5e7eba Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:52 -0500 Subject: net: dsa: setup a tree when adding a switch to it Now that the tree setup is centralized, we can simplify the code a bit more by setting up or tearing down the tree directly when adding or removing a switch to/from it. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 44d26b5977cd..3db50e68640e 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -489,6 +489,8 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, unsigned int index) { + dsa_tree_teardown(dst); + dst->ds[index] = NULL; dsa_tree_put(dst); } @@ -497,6 +499,7 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, struct dsa_switch *ds) { unsigned int index = ds->index; + int err; if (dst->ds[index]) return -EBUSY; @@ -504,7 +507,11 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst, dsa_tree_get(dst); dst->ds[index] = ds; - return 0; + err = dsa_tree_setup(dst); + if (err) + dsa_tree_remove_switch(dst, index); + + return err; } static int dsa_port_parse_user(struct dsa_port *dp, const char *name) @@ -704,12 +711,17 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) return dsa_switch_parse_ports(ds, cd); } +static int dsa_switch_add(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + return dsa_tree_add_switch(dst, ds); +} + static int _dsa_register_switch(struct dsa_switch *ds) { struct dsa_chip_data *pdata = ds->dev->platform_data; struct device_node *np = ds->dev->of_node; - struct dsa_switch_tree *dst; - unsigned int index; int err; if (np) @@ -722,20 +734,7 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err) return err; - index = ds->index; - dst = ds->dst; - - err = dsa_tree_add_switch(dst, ds); - if (err) - return err; - - err = dsa_tree_setup(dst); - if (err) { - dsa_tree_teardown(dst); - dsa_tree_remove_switch(dst, index); - } - - return err; + return dsa_switch_add(ds); } struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) @@ -777,8 +776,6 @@ static void _dsa_unregister_switch(struct dsa_switch *ds) struct dsa_switch_tree *dst = ds->dst; unsigned int index = ds->index; - dsa_tree_teardown(dst); - dsa_tree_remove_switch(dst, index); } -- cgit v1.2.3 From b4fbb347fe4cd7988d0f9453a7e3ab0cd1b4a75a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:53 -0500 Subject: net: dsa: rename probe and remove switch functions This commit brings no functional changes. It gets rid of the underscore prefixed _dsa_register_switch and _dsa_unregister_switch functions in favor of dsa_switch_probe() which parses and adds a switch to a tree and dsa_switch_remove() which removes a switch from a tree. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 3db50e68640e..fd54a8e17986 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -718,7 +718,7 @@ static int dsa_switch_add(struct dsa_switch *ds) return dsa_tree_add_switch(dst, ds); } -static int _dsa_register_switch(struct dsa_switch *ds) +static int dsa_switch_probe(struct dsa_switch *ds) { struct dsa_chip_data *pdata = ds->dev->platform_data; struct device_node *np = ds->dev->of_node; @@ -764,14 +764,14 @@ int dsa_register_switch(struct dsa_switch *ds) int err; mutex_lock(&dsa2_mutex); - err = _dsa_register_switch(ds); + err = dsa_switch_probe(ds); mutex_unlock(&dsa2_mutex); return err; } EXPORT_SYMBOL_GPL(dsa_register_switch); -static void _dsa_unregister_switch(struct dsa_switch *ds) +static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; unsigned int index = ds->index; @@ -782,7 +782,7 @@ static void _dsa_unregister_switch(struct dsa_switch *ds) void dsa_unregister_switch(struct dsa_switch *ds) { mutex_lock(&dsa2_mutex); - _dsa_unregister_switch(ds); + dsa_switch_remove(ds); mutex_unlock(&dsa2_mutex); } EXPORT_SYMBOL_GPL(dsa_unregister_switch); -- cgit v1.2.3 From 72ad533acc22870156736c2fef4674c01307695e Mon Sep 17 00:00:00 2001 From: Mark Greer Date: Thu, 15 Jun 2017 20:34:21 -0700 Subject: NFC: digital: Abort cmd when deactivating target When deactivating an active target, the outstanding command should be aborted. Signed-off-by: Mark Greer Signed-off-by: Samuel Ortiz --- net/nfc/digital_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index de6dd37d04c7..ec0a8998e52d 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -650,6 +650,7 @@ static void digital_deactivate_target(struct nfc_dev *nfc_dev, return; } + digital_abort_cmd(ddev); ddev->curr_protocol = 0; } -- cgit v1.2.3 From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001 From: Mark Greer Date: Thu, 15 Jun 2017 20:34:22 -0700 Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support Once an NFC target (i.e., a tag) is found, it remains active until there is a failure reading or writing it (often caused by the target moving out of range). While the target is active, the NFC adapter and antenna must remain powered. This wastes power when the target remains in range but the client application no longer cares whether it is there or not. To mitigate this, add a new netlink command that allows userspace to deactivate an active target. When issued, this command will cause the NFC subsystem to act as though the target was moved out of range. Once the command has been executed, the client application can power off the NFC adapter to reduce power consumption. Signed-off-by: Mark Greer Signed-off-by: Samuel Ortiz --- net/nfc/netlink.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index b251fb936a27..f6359c277212 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -928,6 +928,30 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) return rc; } +static int nfc_genl_deactivate_target(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + u32 device_idx, target_idx; + int rc; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + return -EINVAL; + + device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(device_idx); + if (!dev) + return -ENODEV; + + target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); + + rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); + + nfc_put_device(dev); + return rc; +} + static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; @@ -1751,6 +1775,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_vendor_cmd, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_DEACTIVATE_TARGET, + .doit = nfc_genl_deactivate_target, + .policy = nfc_genl_policy, + }, }; static struct genl_family nfc_genl_family __ro_after_init = { -- cgit v1.2.3 From ff0fd34eaee9978f9ed7f6e2ac47f9590d4afac3 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:57 +0100 Subject: net: bridge: Rename mglist to host_joined The boolean mglist indicates the host has joined a particular multicast group on the bridge interface. It is badly named, obscuring what is means. Rename it. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- net/bridge/br_mdb.c | 2 +- net/bridge/br_multicast.c | 14 +++++++------- net/bridge/br_private.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index a096d3e189da..7f98a7d25866 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -137,7 +137,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) { - if ((mdst && mdst->mglist) || + if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; br->dev->stats.multicast++; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 31ddff22563e..aa716a33cb71 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -655,7 +655,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) call_rcu_bh(&p->rcu, br_multicast_free_pg); err = 0; - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); break; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 5f7f0e9d446c..bfe5adb1f51c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -249,7 +249,7 @@ static void br_multicast_group_expired(struct timer_list *t) if (!netif_running(br->dev) || timer_pending(&mp->timer)) goto out; - mp->mglist = false; + mp->host_joined = false; if (mp->ports) goto out; @@ -292,7 +292,7 @@ static void br_multicast_del_pg(struct net_bridge *br, p->flags); call_rcu_bh(&p->rcu, br_multicast_free_pg); - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); @@ -773,7 +773,7 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - mp->mglist = true; + mp->host_joined = true; mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } @@ -1477,7 +1477,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay *= br->multicast_last_member_count; - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1561,7 +1561,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; max_delay *= br->multicast_last_member_count; - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1622,7 +1622,7 @@ br_multicast_leave_group(struct net_bridge *br, br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } @@ -1662,7 +1662,7 @@ br_multicast_leave_group(struct net_bridge *br, br->multicast_last_member_interval; if (!port) { - if (mp->mglist && + if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 40553d832b6e..1312b8d20ec3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -209,7 +209,7 @@ struct net_bridge_mdb_entry struct rcu_head rcu; struct timer_list timer; struct br_ip addr; - bool mglist; + bool host_joined; }; struct net_bridge_mdb_htable -- cgit v1.2.3 From 2a26028d119267a2386733dd71d256f269e70f52 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:58 +0100 Subject: net: bridge: Send notification when host join/leaves a group The host can join or leave a multicast group on the brX interface, as indicated by IGMP snooping. This is tracked within the bridge multicast code. Send a notification when this happens, in the same way a notification is sent when a port of the bridge joins/leaves a group because of IGMP snooping. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 9 ++++++--- net/bridge/br_multicast.c | 6 +++++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index aa716a33cb71..702408d2a93c 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -317,7 +317,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, #endif mdb.obj.orig_dev = port_dev; - if (port_dev && type == RTM_NEWMDB) { + if (p && port_dev && type == RTM_NEWMDB) { complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (complete_info) { complete_info->port = p; @@ -327,7 +327,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, if (switchdev_port_obj_add(port_dev, &mdb.obj)) kfree(complete_info); } - } else if (port_dev && type == RTM_DELMDB) { + } else if (p && port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); } @@ -353,7 +353,10 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_mdb_entry entry; memset(&entry, 0, sizeof(entry)); - entry.ifindex = port->dev->ifindex; + if (port) + entry.ifindex = port->dev->ifindex; + else + entry.ifindex = dev->ifindex; entry.addr.proto = group->proto; entry.addr.u.ip4 = group->u.ip4; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bfe5adb1f51c..cb4729539b82 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -250,6 +250,7 @@ static void br_multicast_group_expired(struct timer_list *t) goto out; mp->host_joined = false; + br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0); if (mp->ports) goto out; @@ -773,7 +774,10 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - mp->host_joined = true; + if (!mp->host_joined) { + mp->host_joined = true; + br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0); + } mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } -- cgit v1.2.3 From 47d5b6db2afa766d7af85db684d0b5f092e4fc46 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:59 +0100 Subject: net: bridge: Add/del switchdev object on host join/leave When the host joins or leaves a multicast group, use switchdev to add an object to the hardware to forward traffic for the group to the host. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 43 +++++++++++++++++++++++++++++++++++++++++++ net/switchdev/switchdev.c | 2 ++ 2 files changed, 45 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 702408d2a93c..b0f4c734900b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -292,6 +292,46 @@ err: kfree(priv); } +static void br_mdb_switchdev_host_port(struct net_device *dev, + struct net_device *lower_dev, + struct br_mdb_entry *entry, int type) +{ + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_HOST_MDB, + .flags = SWITCHDEV_F_DEFER, + }, + .vid = entry->vid, + }; + + if (entry->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(entry->addr.u.ip4, mdb.addr); +#if IS_ENABLED(CONFIG_IPV6) + else + ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr); +#endif + + mdb.obj.orig_dev = dev; + switch (type) { + case RTM_NEWMDB: + switchdev_port_obj_add(lower_dev, &mdb.obj); + break; + case RTM_DELMDB: + switchdev_port_obj_del(lower_dev, &mdb.obj); + break; + } +} + +static void br_mdb_switchdev_host(struct net_device *dev, + struct br_mdb_entry *entry, int type) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) + br_mdb_switchdev_host_port(dev, lower_dev, entry, type); +} + static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, struct br_mdb_entry *entry, int type) { @@ -331,6 +371,9 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, switchdev_port_obj_del(port_dev, &mdb.obj); } + if (!p) + br_mdb_switchdev_host(dev, entry, type); + skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0531b41d1f2d..74b9d916a58b 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -345,6 +345,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return sizeof(struct switchdev_obj_port_vlan); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); + case SWITCHDEV_OBJ_ID_HOST_MDB: + return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } -- cgit v1.2.3 From 5f4dbc50ce4d74b6f57a25fa114fcefe55acce17 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:11:00 +0100 Subject: net: dsa: slave: Handle switchdev host mdb add/del Add code to handle switchdev host mdb add/del. Since DSA uses one of the switch ports as a transport to the host, we just need to add an MDB on this port. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index cc7fe47dd4bf..35715fda84b2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -304,6 +304,13 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + /* DSA can directly translate this to a normal MDB add, + * but on the CPU port. + */ + err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj), + trans); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), trans); @@ -326,6 +333,12 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + /* DSA can directly translate this to a normal MDB add, + * but on the CPU port. + */ + err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; -- cgit v1.2.3 From bb9f603174545c01aea92f116803aeb0e6478b28 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:11:01 +0100 Subject: net: dsa: add more const attributes The notify mechanism does not need to modify the port it is notifying. So make the parameter const. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++-- net/dsa/port.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index bb0218c1b570..507e1ce4d4d2 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -147,10 +147,10 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); -int dsa_port_mdb_add(struct dsa_port *dp, +int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); -int dsa_port_mdb_del(struct dsa_port *dp, +int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, diff --git a/net/dsa/port.c b/net/dsa/port.c index bb30b1a7de3a..a85cd63a91c4 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -17,7 +17,7 @@ #include "dsa_priv.h" -static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) +static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { struct raw_notifier_head *nh = &dp->ds->dst->nh; int err; @@ -215,7 +215,7 @@ int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) return ds->ops->port_fdb_dump(ds, port, cb, data); } -int dsa_port_mdb_add(struct dsa_port *dp, +int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) { @@ -229,7 +229,7 @@ int dsa_port_mdb_add(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info); } -int dsa_port_mdb_del(struct dsa_port *dp, +int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb) { struct dsa_notifier_mdb_info info = { -- cgit v1.2.3 From ae45102c9d247516e52f41b3381e195bbc25a362 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:11:02 +0100 Subject: net: dsa: switch: Don't add CPU port to an mdb by default Now that the host indicates when a multicast group should be forwarded from the switch to the host, don't do it by default. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/switch.c b/net/dsa/switch.c index e6c06aa349a6..1155e43c157f 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -121,7 +121,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (ds->index == info->sw_index) set_bit(info->port, group); for (port = 0; port < ds->num_ports; port++) - if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) + if (dsa_is_dsa_port(ds, port)) set_bit(port, group); if (switchdev_trans_ph_prepare(trans)) { -- cgit v1.2.3 From 2ea7a679ca2abd251c1ec03f20508619707e1749 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 7 Nov 2017 00:04:24 +0100 Subject: net: dsa: Don't add vlans when vlan filtering is disabled The software bridge can be build with vlan filtering support included. However, by default it is turned off. In its turned off state, it still passes VLANs via switchev, even though they are not to be used. Don't pass these VLANs to the hardware. Only do so when vlan filtering is enabled. This fixes at least one corner case. There are still issues in other corners, such as when vlan_filtering is later enabled. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/port.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index a85cd63a91c4..bb4be2679904 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -252,7 +252,10 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); + if (br_vlan_enabled(dp->bridge_dev)) + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); + + return 0; } int dsa_port_vlan_del(struct dsa_port *dp, @@ -264,7 +267,10 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; - return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); + if (br_vlan_enabled(dp->bridge_dev)) + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); + + return 0; } int dsa_port_fixed_link_register_of(struct dsa_port *dp) -- cgit v1.2.3 From a3dcaf17ee54f1d01d22cc2b22cab0b4f60d78cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:27 -0800 Subject: net: allow per netns sysctl_rmem and sysctl_wmem for protos As we want to gradually implement per netns sysctl_rmem and sysctl_wmem on per protocol basis, add two new fields in struct proto, and two new helpers : sk_get_wmem0() and sk_get_rmem0() First user will be TCP. Then UDP and SCTP can be easily converted, while DECNET probably wont get this support. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 759400053110..c59bcf90d905 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2346,16 +2346,18 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { - if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ + int wmem0 = sk_get_wmem0(sk, prot); + if (sk->sk_type == SOCK_STREAM) { - if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + if (sk->sk_wmem_queued < wmem0) return 1; - } else if (refcount_read(&sk->sk_wmem_alloc) < - prot->sysctl_wmem[0]) + } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; + } } if (sk_has_memory_pressure(sk)) { -- cgit v1.2.3 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++---------------- net/ipv4/tcp.c | 21 ++++++++------------- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 6 files changed, 45 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a82b44038308..ef0ff3357a44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4cb19ed4628..bc71a27d5ad9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); @@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } @@ -3636,13 +3631,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54ee09cbcf7..9ceaa1fdc3ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0162c577bb9c..1eac84b8044e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2409,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; - + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a9d917e4dad5..9b98d35aa0d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e2529958b52..6bb98c93edfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = { .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, -- cgit v1.2.3 From dd9d598c6657e2d2eed4497ff2c5d744015201dc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Nov 2017 16:33:08 +0800 Subject: ip_gre: add the support for i/o_flags update via netlink Now ip_gre is using ip_tunnel_changelink to update it's properties, but ip_tunnel_changelink in ip_tunnel doesn't update i/o_flags as a common function. o_flags updates would cause that tunnel->tun_hlen / hlen and dev->mtu / needed_headroom need to be recalculated, and dev->(hw_)features need to be updated as well. Therefore, we can't just add the update into ip_tunnel_update called in ip_tunnel_changelink, and it's also better not to touch ip_tunnel codes. This patch updates i/o_flags and calls ipgre_link_update to recalculate these gre properties after ip_tunnel_changelink does the common update. Note that since ipgre_link_update doesn't know the lower dev, it will update gre->hlen, dev->mtu and dev->needed_headroom with the value of 'new tun_hlen - old tun_hlen'. In this way, we can avoid many redundant codes, unlike ip6_gre. Reported-by: Jianlin Shi Signed-off-by: Xin Long Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c105a315b1a3..81e1e20af33b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -773,6 +773,30 @@ free_skb: return NETDEV_TX_OK; } +static void ipgre_link_update(struct net_device *dev, bool set_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int len; + + len = tunnel->tun_hlen; + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + len = tunnel->tun_hlen - len; + tunnel->hlen = tunnel->hlen + len; + + dev->needed_headroom = dev->needed_headroom + len; + if (set_mtu) + dev->mtu = max_t(int, dev->mtu - len, 68); + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + tunnel->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + dev->features |= NETIF_F_LLTX; + } +} + static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -1307,9 +1331,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -1322,7 +1346,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p, fwmark); + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; } static size_t ipgre_get_size(const struct net_device *dev) -- cgit v1.2.3 From a0efab67aeb9406e30b6a19dd6dd335e24f6bd66 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Nov 2017 16:33:09 +0800 Subject: ip_gre: add the support for i/o_flags update via ioctl As patch 'ip_gre: add the support for i/o_flags update via netlink' did for netlink, we also need to do the same job for these update via ioctl. This patch is to update i/o_flags and call ipgre_link_update to recalculate these gre properties after ip_tunnel_ioctl does the common update. Signed-off-by: Xin Long Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 81e1e20af33b..bb6239169b1a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -800,17 +800,19 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err; struct ip_tunnel_parm p; + int err; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) return -EFAULT; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || + ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); p.o_flags = gre_flags_to_tnl_flags(p.o_flags); @@ -818,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; + if (cmd == SIOCCHGTUNNEL) { + struct ip_tunnel *t = netdev_priv(dev); + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, true); + } + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; + return 0; } -- cgit v1.2.3 From e0496cbbf8fb5d831e74b8cedb54236a49413679 Mon Sep 17 00:00:00 2001 From: Manish Kurup Date: Tue, 7 Nov 2017 15:48:15 -0500 Subject: act_vlan: Change stats update to use per-core stats The VLAN action maintains one set of stats across all cores, and uses a spinlock to synchronize updates to it from the same. Changed this to use a per-CPU stats context instead. This change will result in better performance. Acked-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Manish Kurup Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 16eb067a8d8f..b093badc1450 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,9 +30,10 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, int err; u16 tci; - spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); - bstats_update(&v->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); + + spin_lock(&v->tcf_lock); action = v->tcf_action; /* Ensure 'data' points at mac_header prior calling vlan manipulating @@ -85,7 +86,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, drop: action = TC_ACT_SHOT; - v->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + unlock: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); @@ -172,7 +174,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_vlan_ops, bind, false); + &act_vlan_ops, bind, true); if (ret) return ret; -- cgit v1.2.3 From 4c5b9d9642c859f7369338fc42c0f62f4151bef3 Mon Sep 17 00:00:00 2001 From: Manish Kurup Date: Tue, 7 Nov 2017 15:49:05 -0500 Subject: act_vlan: VLAN action rewrite to use RCU lock/unlock and update Using a spinlock in the VLAN action causes performance issues when the VLAN action is used on multiple cores. Rewrote the VLAN action to use RCU read locking for reads and updates instead. All functions now use an RCU dereferenced pointer to access the VLAN action context. Modified helper functions used by other modules, to use the RCU as opposed to directly accessing the structure. Acked-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Manish Kurup Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 75 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b093badc1450..97f717a13ad5 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -26,6 +26,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; int action; int err; u16 tci; @@ -33,24 +34,27 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&v->tcf_tm); bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); - spin_lock(&v->tcf_lock); - action = v->tcf_action; - /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. */ if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - switch (v->tcfv_action) { + rcu_read_lock(); + + action = READ_ONCE(v->tcf_action); + + p = rcu_dereference(v->vlan_p); + + switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | - (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); + err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | + (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -69,14 +73,14 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; } /* replace the vid */ - tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (v->tcfv_push_prio) { + if (p->tcfv_push_prio) { tci &= ~VLAN_PRIO_MASK; - tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ - __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; default: BUG(); @@ -89,10 +93,10 @@ drop: qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); unlock: + rcu_read_unlock(); if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - spin_unlock(&v->tcf_lock); return action; } @@ -109,6 +113,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tcf_vlan_params *p, *p_old; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -187,46 +192,67 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - spin_lock_bh(&v->tcf_lock); - - v->tcfv_action = action; - v->tcfv_push_vid = push_vid; - v->tcfv_push_prio = push_prio; - v->tcfv_push_proto = push_proto; + ASSERT_RTNL(); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + if (ovr) + tcf_idr_release(*a, bind); + return -ENOMEM; + } v->tcf_action = parm->action; - spin_unlock_bh(&v->tcf_lock); + p_old = rtnl_dereference(v->vlan_p); + + p->tcfv_action = action; + p->tcfv_push_vid = push_vid; + p->tcfv_push_prio = push_prio; + p->tcfv_push_proto = push_proto; + + rcu_assign_pointer(v->vlan_p, p); + + if (p_old) + kfree_rcu(p_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } +static void tcf_vlan_cleanup(struct tc_action *a, int bind) +{ + struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; + + p = rcu_dereference_protected(v->vlan_p, 1); + kfree_rcu(p, rcu); +} + static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, .bindcnt = v->tcf_bindcnt - bind, .action = v->tcf_action, - .v_action = v->tcfv_action, + .v_action = p->tcfv_action, }; struct tcf_t t; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || - v->tcfv_action == TCA_VLAN_ACT_MODIFY) && - (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || + if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || + p->tcfv_action == TCA_VLAN_ACT_MODIFY) && + (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto) || + p->tcfv_push_proto) || (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - v->tcfv_push_prio)))) + p->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); @@ -262,6 +288,7 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), -- cgit v1.2.3 From 9ef8690be13d8ae3130749fbcc0cc21e4e3f738c Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Wed, 8 Nov 2017 16:30:44 +1100 Subject: net/ncsi: Improve general state logging The NCSI driver is mostly silent which becomes a headache when trying to determine what has occurred on the NCSI connection. This adds additional logging in a few key areas such as state transitions and calling out certain errors more visibly. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 15 +++++++++- net/ncsi/ncsi-manage.c | 76 +++++++++++++++++++++++++++++++++++++------------- net/ncsi/ncsi-rsp.c | 10 ++++++- 3 files changed, 80 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f135938bf781..67e708e98ccf 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -73,6 +73,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); + netdev_info(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", + nc->id, data & 0x1 ? "up" : "down"); + chained = !list_empty(&nc->link); state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); @@ -145,6 +148,8 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, ncm = &nc->modes[NCSI_MODE_LINK]; hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; ncm->data[3] = ntohl(hncdsc->status); + netdev_info(ndp->ndev.dev, "NCSI: HNCDSC AEN - channel %u state %s\n", + nc->id, ncm->data[3] & 0x3 ? "up" : "down"); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_ACTIVE) { spin_unlock_irqrestore(&nc->lock, flags); @@ -212,10 +217,18 @@ int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) } ret = ncsi_validate_aen_pkt(h, nah->payload); - if (ret) + if (ret) { + netdev_warn(ndp->ndev.dev, + "NCSI: 'bad' packet ignored for AEN type 0x%x\n", + h->type); goto out; + } ret = nah->handler(ndp, h); + if (ret) + netdev_err(ndp->ndev.dev, + "NCSI: Handler for AEN type 0x%x returned %d\n", + h->type, ret); out: consume_skb(skb); return ret; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 47baf914eec2..a2b904a718c6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -229,6 +229,8 @@ static void ncsi_channel_monitor(unsigned long data) case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: break; default: + netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", + nc->id); if (!(ndp->flags & NCSI_DEV_HWA)) { ncsi_report_link(ndp, true); ndp->flags |= NCSI_DEV_RESHUFFLE; @@ -682,7 +684,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index); if (!data) { netdev_err(ndp->ndev.dev, - "ncsi: failed to retrieve filter %d\n", index); + "NCSI: failed to retrieve filter %d\n", index); /* Set the VLAN id to 0 - this will still disable the entry in * the filter table, but we won't know what it was. */ @@ -692,7 +694,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, } netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: removed vlan tag %u at index %d\n", + "NCSI: removed vlan tag %u at index %d\n", vid, index + 1); ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index); @@ -718,7 +720,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, if (index < 0) { /* New tag to add */ netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: new vlan id to set: %u\n", + "NCSI: new vlan id to set: %u\n", vlan->vid); break; } @@ -745,7 +747,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, } netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "ncsi: set vid %u in packet, index %u\n", + "NCSI: set vid %u in packet, index %u\n", vlan->vid, index + 1); nca->type = NCSI_PKT_CMD_SVF; nca->words[1] = vlan->vid; @@ -784,8 +786,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD_SP\n"); goto error; + } nd->state = ncsi_dev_state_config_cis; break; @@ -797,8 +802,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; nca.channel = nc->id; ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD_CIS\n"); goto error; + } nd->state = ncsi_dev_state_config_clear_vids; break; @@ -895,10 +903,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } ret = ncsi_xmit_cmd(&nca); - if (ret) + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit CMD %x\n", + nca.type); goto error; + } break; case ncsi_dev_state_config_done: + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); if (nc->reconfigure_needed) { /* This channel's configuration has been updated @@ -925,6 +939,9 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else { hot_nc = NULL; nc->state = NCSI_CHANNEL_INACTIVE; + netdev_warn(ndp->ndev.dev, + "NCSI: channel %u link down after config\n", + nc->id); } spin_unlock_irqrestore(&nc->lock, flags); @@ -937,8 +954,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) ncsi_process_next_channel(ndp); break; default: - netdev_warn(dev, "Wrong NCSI state 0x%x in config\n", - nd->state); + netdev_alert(dev, "Wrong NCSI state 0x%x in config\n", + nd->state); } return; @@ -990,10 +1007,17 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) } if (!found) { + netdev_warn(ndp->ndev.dev, + "NCSI: No channel found with link\n"); ncsi_report_link(ndp, true); return -ENODEV; } + ncm = &found->modes[NCSI_MODE_LINK]; + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + found->id, ncm->data[2] & 0x1 ? "up" : "down"); + out: spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&found->link, &ndp->channel_queue); @@ -1055,6 +1079,8 @@ static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) /* We can have no channels in extremely case */ if (list_empty(&ndp->channel_queue)) { + netdev_err(ndp->ndev.dev, + "NCSI: No available channels for HWA\n"); ncsi_report_link(ndp, false); return -ENOENT; } @@ -1223,6 +1249,9 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) return; error: + netdev_err(ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during probe\n", + nca.type); ncsi_report_link(ndp, true); } @@ -1276,10 +1305,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) switch (old_state) { case NCSI_CHANNEL_INACTIVE: ndp->ndev.state = ncsi_dev_state_config; + netdev_info(ndp->ndev.dev, "NCSI: configuring channel %u\n", + nc->id); ncsi_configure_channel(ndp); break; case NCSI_CHANNEL_ACTIVE: ndp->ndev.state = ncsi_dev_state_suspend; + netdev_info(ndp->ndev.dev, "NCSI: suspending channel %u\n", + nc->id); ncsi_suspend_channel(ndp); break; default: @@ -1299,6 +1332,8 @@ out: return ncsi_choose_active_channel(ndp); } + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: No more channels to process\n"); ncsi_report_link(ndp, false); return -ENODEV; } @@ -1390,7 +1425,7 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) ncsi_dev_state_config || !list_empty(&nc->link)) { netdev_printk(KERN_DEBUG, nd->dev, - "ncsi: channel %p marked dirty\n", + "NCSI: channel %p marked dirty\n", nc); nc->reconfigure_needed = true; } @@ -1410,7 +1445,7 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) spin_unlock_irqrestore(&ndp->lock, flags); netdev_printk(KERN_DEBUG, nd->dev, - "ncsi: kicked channel %p\n", nc); + "NCSI: kicked channel %p\n", nc); n++; } } @@ -1431,7 +1466,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) nd = ncsi_find_dev(dev); if (!nd) { - netdev_warn(dev, "ncsi: No net_device?\n"); + netdev_warn(dev, "NCSI: No net_device?\n"); return 0; } @@ -1442,7 +1477,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) n_vids++; if (vlan->vid == vid) { netdev_printk(KERN_DEBUG, dev, - "vid %u already registered\n", vid); + "NCSI: vid %u already registered\n", vid); return 0; } } @@ -1461,7 +1496,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) vlan->vid = vid; list_add_rcu(&vlan->list, &ndp->vlan_vids); - netdev_printk(KERN_DEBUG, dev, "Added new vid %u\n", vid); + netdev_printk(KERN_DEBUG, dev, "NCSI: Added new vid %u\n", vid); found = ncsi_kick_channels(ndp) != 0; @@ -1481,7 +1516,7 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) nd = ncsi_find_dev(dev); if (!nd) { - netdev_warn(dev, "ncsi: no net_device?\n"); + netdev_warn(dev, "NCSI: no net_device?\n"); return 0; } @@ -1491,14 +1526,14 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) list_for_each_entry_safe(vlan, tmp, &ndp->vlan_vids, list) if (vlan->vid == vid) { netdev_printk(KERN_DEBUG, dev, - "vid %u found, removing\n", vid); + "NCSI: vid %u found, removing\n", vid); list_del_rcu(&vlan->list); found = true; kfree(vlan); } if (!found) { - netdev_err(dev, "ncsi: vid %u wasn't registered!\n", vid); + netdev_err(dev, "NCSI: vid %u wasn't registered!\n", vid); return -EINVAL; } @@ -1581,10 +1616,12 @@ int ncsi_start_dev(struct ncsi_dev *nd) return 0; } - if (ndp->flags & NCSI_DEV_HWA) + if (ndp->flags & NCSI_DEV_HWA) { + netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n"); ret = ncsi_enable_hwa(ndp); - else + } else { ret = ncsi_choose_active_channel(ndp); + } return ret; } @@ -1615,6 +1652,7 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } } + netdev_printk(KERN_DEBUG, ndp->ndev.dev, "NCSI: Stopping device\n"); ncsi_report_link(ndp, true); } EXPORT_SYMBOL_GPL(ncsi_stop_dev); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 927dad4759d1..58186c4102f0 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1032,11 +1032,19 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, if (payload < 0) payload = ntohs(hdr->length); ret = ncsi_validate_rsp_pkt(nr, payload); - if (ret) + if (ret) { + netdev_warn(ndp->ndev.dev, + "NCSI: 'bad' packet ignored for type 0x%x\n", + hdr->type); goto out; + } /* Process the packet */ ret = nrh->handler(nr); + if (ret) + netdev_err(ndp->ndev.dev, + "NCSI: Handler for packet type 0x%x returned %d\n", + hdr->type, ret); out: ncsi_free_request(nr); return ret; -- cgit v1.2.3 From 04bad8bda9e25afe676a6f4452f3b304c1fdea16 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Wed, 8 Nov 2017 16:30:45 +1100 Subject: net/ncsi: Don't return error on normal response Several response handlers return EBUSY if the data corresponding to the command/response pair is already set. There is no reason to return an error here; the channel is advertising something as enabled because we told it to enable it, and it's possible that the feature has been enabled previously. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-rsp.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 58186c4102f0..efd933ff5570 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -146,7 +146,7 @@ static int ncsi_rsp_handler_ec(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_ENABLE]; if (ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -173,7 +173,7 @@ static int ncsi_rsp_handler_dc(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_ENABLE]; if (!ncm->enable) - return -EBUSY; + return 0; ncm->enable = 0; return 0; @@ -217,7 +217,7 @@ static int ncsi_rsp_handler_ecnt(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; if (ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -239,7 +239,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; if (!ncm->enable) - return -EBUSY; + return 0; ncm->enable = 1; return 0; @@ -263,7 +263,7 @@ static int ncsi_rsp_handler_ae(struct ncsi_request *nr) /* Check if the AEN has been enabled */ ncm = &nc->modes[NCSI_MODE_AEN]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to AEN configuration */ cmd = (struct ncsi_cmd_ae_pkt *)skb_network_header(nr->cmd); @@ -382,7 +382,7 @@ static int ncsi_rsp_handler_ev(struct ncsi_request *nr) /* Check if VLAN mode has been enabled */ ncm = &nc->modes[NCSI_MODE_VLAN]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to VLAN mode */ cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd); @@ -409,7 +409,7 @@ static int ncsi_rsp_handler_dv(struct ncsi_request *nr) /* Check if VLAN mode has been enabled */ ncm = &nc->modes[NCSI_MODE_VLAN]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to VLAN mode */ ncm->enable = 0; @@ -455,13 +455,10 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) bitmap = &ncf->bitmap; if (cmd->at_e & 0x1) { - if (test_and_set_bit(cmd->index, bitmap)) - return -EBUSY; + set_bit(cmd->index, bitmap); memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); } else { - if (!test_and_clear_bit(cmd->index, bitmap)) - return -EBUSY; - + clear_bit(cmd->index, bitmap); memset(ncf->data + 6 * cmd->index, 0, 6); } @@ -485,7 +482,7 @@ static int ncsi_rsp_handler_ebf(struct ncsi_request *nr) /* Check if broadcast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_BC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to broadcast filter mode */ cmd = (struct ncsi_cmd_ebf_pkt *)skb_network_header(nr->cmd); @@ -511,7 +508,7 @@ static int ncsi_rsp_handler_dbf(struct ncsi_request *nr) /* Check if broadcast filter isn't enabled */ ncm = &nc->modes[NCSI_MODE_BC]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to broadcast filter mode */ ncm->enable = 0; @@ -538,7 +535,7 @@ static int ncsi_rsp_handler_egmf(struct ncsi_request *nr) /* Check if multicast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_MC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to multicast filter mode */ cmd = (struct ncsi_cmd_egmf_pkt *)skb_network_header(nr->cmd); @@ -564,7 +561,7 @@ static int ncsi_rsp_handler_dgmf(struct ncsi_request *nr) /* Check if multicast filter has been enabled */ ncm = &nc->modes[NCSI_MODE_MC]; if (!ncm->enable) - return -EBUSY; + return 0; /* Update to multicast filter mode */ ncm->enable = 0; @@ -591,7 +588,7 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) /* Check if flow control has been enabled */ ncm = &nc->modes[NCSI_MODE_FC]; if (ncm->enable) - return -EBUSY; + return 0; /* Update to flow control mode */ cmd = (struct ncsi_cmd_snfc_pkt *)skb_network_header(nr->cmd); -- cgit v1.2.3 From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 9 ++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6233e06fa35c..a6dffd65eb9d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5059,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; + array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; } static inline size_t inet6_ifla6_size(void) @@ -5986,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; +static const int zero = 0; static const int one = 1; static const int two_five_five = 255; @@ -6356,6 +6358,15 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_disable_policy, }, + { + .procname = "ndisc_tclass", + .data = &ipv6_devconf.ndisc_tclass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&two_five_five, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f9c3ffe04382..b3cea200c85e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb, int hop_limit, int len) { struct ipv6hdr *hdr; + struct inet6_dev *idev; + unsigned tclass; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + tclass = idev ? idev->cnf.ndisc_tclass : 0; + rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; -- cgit v1.2.3 From 8d6e79d3ce13e34957de87f7584cbf1bcde74c57 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 8 Nov 2017 09:59:26 +0100 Subject: tipc: improve link resiliency when rps is activated Currently, the TIPC RPS dissector is based only on the incoming packets' source node address, hence steering all traffic from a node to the same core. We have seen that this makes the links vulnerable to starvation and unnecessary resets when we turn down the link tolerance to very low values. To reduce the risk of this happening, we exempt probe and probe replies packets from the convergence to one core per source node. Instead, we do the opposite, - we try to diverge those packets across as many cores as possible, by randomizing the flow selector key. To make such packets identifiable to the dissector, we add a new 'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is set both for PROBE and PROBE_REPLY messages, and only for those. It should be noted that these packets are not part of any flow anyway, and only constitute a minuscule fraction of all packets sent across a link. Hence, there is no risk that this will affect overall performance. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 30 +++++++++++++++--------------- net/tipc/link.c | 26 +++++++++++++++----------- net/tipc/msg.h | 10 ++++++++++ 3 files changed, 40 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1f5caafb4492..15ce30063765 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -772,23 +773,22 @@ proto_again: break; } case htons(ETH_P_TIPC): { - struct { - __be32 pre[3]; - __be32 srcnode; - } *hdr, _hdr; - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS, + FLOW_DISSECTOR_KEY_TIPC, target_container); - key_addrs->tipcaddrs.srcnode = hdr->srcnode; - key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; @@ -1024,8 +1024,8 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - diff -= sizeof(flow->addrs.tipcaddrs); + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); break; } return (sizeof(*flow) - diff) / sizeof(u32); @@ -1039,8 +1039,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - return flow->addrs.tipcaddrs.srcnode; + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; default: return 0; } @@ -1321,8 +1321,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { - .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, - .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/tipc/link.c b/net/tipc/link.c index 870b9b8f877a..6bce0b1117bd 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -239,7 +239,8 @@ static int link_is_up(struct tipc_link *l) static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq); static void link_print(struct tipc_link *l, const char *str); static int tipc_link_build_nack_msg(struct tipc_link *l, @@ -773,7 +774,7 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) } if (state || probe || setup) - tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq); return rc; } @@ -1174,7 +1175,7 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1188,7 +1189,7 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) if (l->state == LINK_ESTABLISHING) mtyp = ACTIVATE_MSG; - tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq); /* Inform peer that this endpoint is going down if applicable */ skb = skb_peek_tail(xmitq); @@ -1215,7 +1216,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, } if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1289,7 +1290,8 @@ drop: } static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq) { struct tipc_link *bcl = l->bc_rcvlink; @@ -1337,6 +1339,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); + msg_set_is_keepalive(hdr, probe || probe_reply); tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + dlen); skb_trim(skb, INT_H_SIZE + dlen); @@ -1442,6 +1445,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 rcv_nxt = l->rcv_nxt; u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + bool reply = msg_probe(hdr); void *data; char *if_name; int rc = 0; @@ -1528,9 +1532,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) rcvgap = peers_snd_nxt - l->rcv_nxt; - if (rcvgap || (msg_probe(hdr))) - tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, - 0, 0, xmitq); + if (rcvgap || reply) + tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, + rcvgap, 0, 0, xmitq); tipc_link_release_pkts(l, ack); /* If NACK, retransmit will now start at right position */ @@ -2122,14 +2126,14 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index cedf811317fb..bf8f57ccc70c 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -226,6 +226,16 @@ static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 19, 1, d); } +static inline int msg_is_keepalive(struct tipc_msg *m) +{ + return msg_bits(m, 0, 19, 1); +} + +static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 19, 1, d); +} + static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); -- cgit v1.2.3 From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default and the successor RACK subsumed FACK and can handle reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/proc.c | 1 - net/ipv4/tcp.c | 2 -- net/ipv4/tcp_input.c | 53 +++++++----------------------------------------- net/ipv4/tcp_metrics.c | 4 +--- net/ipv4/tcp_minisocks.c | 5 +---- net/ipv4/tcp_output.c | 5 +---- 6 files changed, 10 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc71a27d5ad9..337555076043 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2509,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9ceaa1fdc3ab..487e181cff86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -842,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { @@ -881,7 +869,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); } tp->rack.reord = 1; @@ -891,8 +878,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; else mib_idx = LINUX_MIB_TCPSACKREORDER; @@ -970,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1248,7 +1232,7 @@ static u8 tcp_sacktag_one(struct sock *sk, fack_count += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; @@ -2051,10 +2035,6 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2064,10 +2044,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2132,16 +2112,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. - * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2190,7 +2160,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2226,12 +2196,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2262,11 +2232,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -3199,8 +3164,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); - delta = tcp_is_fack(tp) ? pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } @@ -5708,9 +5672,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d5ddebfd831..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bb86580decd..326c9282bf94 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -509,10 +509,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9b98d35aa0d8..094c429b4401 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1257,7 +1257,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -2961,9 +2961,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { -- cgit v1.2.3 From 737ff314563ca27f044f9a3a041e9d42491ef7ce Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:27 -0800 Subject: tcp: use sequence distance to detect reordering Replace the reordering distance measurement in packet unit with sequence based approach. Previously it trackes the number of "packets" toward the forward ACK (i.e. highest sacked sequence)in a state variable "fackets_out". Precisely measuring reordering degree on packet distance has not much benefit, as the degree constantly changes by factors like path, load, and congestion window. It is also complicated and prone to arcane bugs. This patch replaces with sequence-based approach that's much simpler. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 - net/ipv4/tcp_input.c | 155 +++++++++++++++++++++-------------------------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 17 ------ 4 files changed, 68 insertions(+), 106 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 337555076043..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2977,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 487e181cff86..94d729be42a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -849,39 +849,39 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -1097,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1174,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1208,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1229,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1484,7 +1479,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } out: - state->fack_count += pcount; return prev; noop: @@ -1563,8 +1557,6 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1575,7 +1567,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; - int unack_bytes; while (*p) { parent = *p; @@ -1588,12 +1579,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, p = &parent->rb_right; continue; } - - state->fack_count = 0; - unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; - if (state->mss_now && unack_bytes > 0) - state->fack_count = unack_bytes / state->mss_now; - return skb; } return NULL; @@ -1651,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1729,7 +1711,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } state->mss_now = tcp_current_mss(sk); - state->fack_count = 0; skb = NULL; i = 0; @@ -1787,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1802,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1822,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1862,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1909,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1959,7 +1941,6 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); @@ -2026,11 +2007,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). @@ -2701,15 +2677,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2745,6 +2721,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2757,19 +2741,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (!tp->sacked_out && tp->fackets_out) - tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2816,11 +2798,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -3030,15 +3012,15 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; @@ -3053,6 +3035,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3083,7 +3066,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3161,15 +3145,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3210,7 +3191,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } @@ -3519,12 +3499,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3556,7 +3535,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_highest_sack_seq(tp); rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3612,8 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); tcp_rack_update_reo_wnd(sk, &rs); @@ -3625,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3641,7 +3620,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3663,7 +3643,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 326c9282bf94..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -475,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 094c429b4401..0256f7a41041 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1218,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. - */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1253,8 +1238,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) -- cgit v1.2.3 From 765924e362d12f87786060b98a49abd91e11ea96 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 9 Nov 2017 08:29:52 +0900 Subject: l2tp: don't close sessions in l2tp_tunnel_destruct() Sessions are already removed by the proto ->destroy() handlers, and since commit f3c66d4e144a ("l2tp: prevent creation of sessions on terminated tunnels"), we're guaranteed that no new session can be created afterwards. Furthermore, l2tp_tunnel_closeall() can sleep when there are sessions left to close. So we really shouldn't call it in a ->sk_destruct() handler, as it can be used from atomic context. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7c8d1eb757a5..350fcd39ebd8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1246,8 +1246,6 @@ static void l2tp_tunnel_destruct(struct sock *sk) list_del_rcu(&tunnel->list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - l2tp_tunnel_closeall(tunnel); - tunnel->sock = NULL; l2tp_tunnel_dec_refcount(tunnel); -- cgit v1.2.3 From 2d919149686e2da001e58e8a2ed49a0200c9db78 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 8 Nov 2017 21:38:28 -0600 Subject: net: decnet: dn_table: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 115106 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/decnet/dn_table.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 08667f68e601..f0710b5d037d 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -156,6 +156,7 @@ static void dn_rehash_zone(struct dn_zone *dz) default: printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n", old_divisor); + /* fall through */ case 256: new_divisor = 1024; new_hashmask = 0x3FF; -- cgit v1.2.3 From 5290ada4a2e6a9dec00e849a49af8f7bf7462449 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 9 Nov 2017 00:03:15 -0800 Subject: sock: Remove the global prot_inuse counter. The per-cpu counter for init_net is prepared in core_initcall. The patch 7d720c3e ("percpu: add __percpu sparse annotations to net") and d6d9ca0fe ("net: this_cpu_xxx conversions") optimize the routines. Then remove the old counter. Cc: Pavel Emelyanov Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index c59bcf90d905..57bbd6040eb6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3044,7 +3044,6 @@ struct prot_inuse { static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); -#ifdef CONFIG_NET_NS void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); @@ -3088,27 +3087,6 @@ static __init int net_inuse_init(void) } core_initcall(net_inuse_init); -#else -static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); - -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) -{ - __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_add); - -int sock_prot_inuse_get(struct net *net, struct proto *prot) -{ - int cpu, idx = prot->inuse_idx; - int res = 0; - - for_each_possible_cpu(cpu) - res += per_cpu(prot_inuse, cpu).val[idx]; - - return res >= 0 ? res : 0; -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -#endif static void assign_proto_idx(struct proto *prot) { -- cgit v1.2.3 From a42c8e33f2044d6b56f167b5506c7e09e5b702c2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 22:29:51 +0100 Subject: net: dsa: Fix SWITCHDEV_ATTR_ID_PORT_PARENT_ID SWITCHDEV_ATTR_ID_PORT_PARENT_ID is used by the software bridge when determining which ports to flood a packet out. If the packet originated from a switch, it assumes the switch has already flooded the packet out the switches ports, so the bridge should not flood the packet itself out switch ports. Ports on the same switch are expected to return the same parent ID when SWITCHDEV_ATTR_ID_PORT_PARENT_ID is called. DSA gets this wrong with clusters of switches. As far as the software bridge is concerned, the cluster is all one switch. A packet from any switch in the cluster can be assumed to have been flooded as needed out of all ports of the cluster, not just the switch it originated from. Hence all ports of a cluster should return the same parent. The old implementation did not, each switch in the cluster had its own ID. Also wrong was that the ID was not unique if multiple DSA instances are in operation. Use the tree ID as the parent ID, which is the same for all switches in a cluster and unique across switch clusters. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 35715fda84b2..d6e7a642493b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -355,11 +355,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev, { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: - attr->u.ppid.id_len = sizeof(ds->index); - memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); + attr->u.ppid.id_len = sizeof(dst->index); + memcpy(&attr->u.ppid.id, &dst->index, attr->u.ppid.id_len); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: attr->u.brport_flags_support = 0; -- cgit v1.2.3 From 13edbdb6ed8b829ab3198a8c7b978c37184317b5 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 22:29:52 +0100 Subject: net: dsa: {e}dsa: set offload_fwd_mark on received packets The software bridge needs to know if a packet has already been bridged by hardware offload to ports in the same hardware offload, in order that it does not re-flood them, causing duplicates. This is particularly true for broadcast and multicast traffic which the host has requested. By setting offload_fwd_mark in the skb the bridge will only flood to ports in other offloads and other netifs. Set this flag in the DSA and EDSA tag driver. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_dsa.c | 2 ++ net/dsa/tag_edsa.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index dbbcdafed8c3..cd13cfc542ce 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -141,6 +141,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } + skb->offload_fwd_mark = 1; + return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index f38a626b3a05..4083326b806e 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -160,6 +160,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } + skb->offload_fwd_mark = 1; + return skb; } -- cgit v1.2.3 From 4672cd36053e4a8480f7ca796758d56ef23ccb78 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Fri, 10 Nov 2017 12:54:35 +0100 Subject: net: dsa: lan9303: Clear offload_fwd_mark for IGMP Now that IGMP packets no longer is flooded in HW, we want the SW bridge to forward packets based on bridge configuration. To make that happen, IGMP packets must have skb->offload_fwd_mark = 0. Signed-off-by: Egil Hjelmeland Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 5ba01fc3c6ba..b8c5e52b2eff 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -92,6 +92,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, { u16 *lan9303_tag; unsigned int source_port; + u16 ether_type_nw; + u8 ip_protocol; if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, @@ -129,6 +131,17 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, skb->offload_fwd_mark = !ether_addr_equal(skb->data - ETH_HLEN, eth_stp_addr); + /* We also need IGMP packets to have skb->offload_fwd_mark = 0. + * Solving this for all conceivable situations would add more cost to + * every packet. Instead we handle just the common case: + * No VLAN tag + Ethernet II framing. + * Test least probable term first. + */ + ether_type_nw = lan9303_tag[2]; + ip_protocol = *(skb->data + 9); + if (ip_protocol == IPPROTO_IGMP && ether_type_nw == htons(ETH_P_IP)) + skb->offload_fwd_mark = 0; + return skb; } -- cgit v1.2.3 From 8fdfd6595bd739d8338fd1f3d553a9a34ed9824c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 11 Nov 2017 06:06:29 +0900 Subject: l2tp: remove .tunnel_sock from struct l2tp_eth This field has never been used. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 3e2dec1fb0f5..5c366ecfa1cb 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -41,7 +41,6 @@ /* via netdev_priv() */ struct l2tp_eth { - struct sock *tunnel_sock; struct l2tp_session *session; atomic_long_t tx_bytes; atomic_long_t tx_packets; @@ -313,7 +312,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->session = session; - priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) -- cgit v1.2.3 From 7198c77aa05560c257ee377ec1f4796812121580 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 11 Nov 2017 06:06:31 +0900 Subject: l2tp: avoid using ->tunnel_sock for getting session's parent tunnel Sessions don't need to use l2tp_sock_to_tunnel(xxx->tunnel_sock) for accessing their parent tunnel. They have the .tunnel field in the l2tp_session structure for that. Furthermore, in all these cases, the session is registered, so we're guaranteed that .tunnel isn't NULL and that the session properly holds a reference on the tunnel. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 66 ++++++++++------------------------------------------- 1 file changed, 12 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 5f5c78b632d0..88b4cb1b7cde 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -295,7 +295,6 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, int error; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int uhlen; error = -ENOTCONN; @@ -308,10 +307,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, if (session == NULL) goto error; - ps = l2tp_session_priv(session); - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto error_put_sess; + tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; @@ -322,7 +318,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) - goto error_put_sess_tun; + goto error_put_sess; /* Reserve space for headers. */ skb_reserve(skb, NET_SKB_PAD); @@ -340,20 +336,17 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); - goto error_put_sess_tun; + goto error_put_sess; } local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); local_bh_enable(); - sock_put(ps->tunnel_sock); sock_put(sk); return total_len; -error_put_sess_tun: - sock_put(ps->tunnel_sock); error_put_sess: sock_put(sk); error: @@ -377,10 +370,8 @@ error: static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *) chan->private; - struct sock *sk_tun; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int uhlen, headroom; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) @@ -391,13 +382,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) if (session == NULL) goto abort; - ps = l2tp_session_priv(session); - sk_tun = ps->tunnel_sock; - if (sk_tun == NULL) - goto abort_put_sess; - tunnel = l2tp_sock_to_tunnel(sk_tun); - if (tunnel == NULL) - goto abort_put_sess; + tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; headroom = NET_SKB_PAD + @@ -406,7 +391,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) session->hdr_len + /* L2TP header */ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) - goto abort_put_sess_tun; + goto abort_put_sess; /* Setup PPP header */ __skb_push(skb, 2); @@ -417,12 +402,10 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) l2tp_xmit_skb(session, skb, session->hdr_len); local_bh_enable(); - sock_put(sk_tun); sock_put(sk); + return 1; -abort_put_sess_tun: - sock_put(sk_tun); abort_put_sess: sock_put(sk); abort: @@ -919,9 +902,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, goto end; pls = l2tp_session_priv(session); - tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; + tunnel = session->tunnel; inet = inet_sk(tunnel->sock); if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { @@ -1001,8 +982,6 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, *usockaddr_len = len; error = 0; - sock_put(pls->tunnel_sock); -end_put_sess: sock_put(sk); end: return error; @@ -1241,7 +1220,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int err; if (!sk) @@ -1265,16 +1243,10 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, /* Special case: if session's session_id is zero, treat ioctl as a * tunnel ioctl */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg); - sock_put(ps->tunnel_sock); goto end_put_sess; } @@ -1400,7 +1372,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - struct pppol2tp_session *ps; int val; int err; @@ -1425,20 +1396,14 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); - sock_put(ps->tunnel_sock); - } else + } else { err = pppol2tp_session_setsockopt(sk, session, optname, val); + } -end_put_sess: sock_put(sk); end: return err; @@ -1526,7 +1491,6 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, struct l2tp_tunnel *tunnel; int val, len; int err; - struct pppol2tp_session *ps; if (level != SOL_PPPOL2TP) return -EINVAL; @@ -1550,16 +1514,10 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - ps = l2tp_session_priv(session); if ((session->session_id == 0) && (session->peer_session_id == 0)) { - err = -EBADF; - tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); - if (tunnel == NULL) - goto end_put_sess; - + tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); - sock_put(ps->tunnel_sock); if (err) goto end_put_sess; } else { -- cgit v1.2.3 From da9ca825ef79fd1c64297b5267d594b11a9e5b16 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 11 Nov 2017 06:06:37 +0900 Subject: l2tp: remove the .tunnel_sock field from struct pppol2tp_session The last user of .tunnel_sock is pppol2tp_connect() which defensively uses it to verify internal data consistency. This check isn't necessary: l2tp_session_get() guarantees that the returned session belongs to the tunnel passed as parameter. And .tunnel_sock is never updated, so checking that it still points to the parent tunnel socket is useless; that test can never fail. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 88b4cb1b7cde..b412fc3351dc 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -127,8 +127,6 @@ struct pppol2tp_session { * PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ struct rcu_head rcu; /* For asynchronous release */ - struct sock *tunnel_sock; /* Pointer to the tunnel UDP - * socket */ int flags; /* accessed by PPPIOCGFLAGS. * Unused. */ }; @@ -592,7 +590,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); - ps->tunnel_sock = session->tunnel->sock; ps->owner = current->pid; /* If PMTU discovery was enabled, use the MTU that was discovered */ @@ -743,13 +740,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = -EEXIST; goto end; } - - /* consistency checks */ - if (ps->tunnel_sock != tunnel->sock) { - mutex_unlock(&ps->sk_lock); - error = -EEXIST; - goto end; - } } else { /* Default MTU must allow space for UDP/L2TP/PPP headers */ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; -- cgit v1.2.3 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/ipv6/output_core.c | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index a338bbc33cf3..4a7e5ffa5108 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,37 +31,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } -/* This function exists only for tap drivers that must support broken - * clients requesting UFO without specifying an IPv6 fragment ID. - * - * This is similar to ipv6_select_ident() but we use an independent hash - * seed to limit information leakage. - * - * The network header must be set before calling this. - */ -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) -{ - static u32 ip6_proxy_idents_hashrnd __read_mostly; - struct in6_addr buf[2]; - struct in6_addr *addrs; - u32 id; - - addrs = skb_header_pointer(skb, - skb_network_offset(skb) + - offsetof(struct ipv6hdr, saddr), - sizeof(buf), buf); - if (!addrs) - return; - - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); - skb_shinfo(skb)->ip6_frag_id = htonl(id); -} -EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); - __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) -- cgit v1.2.3 From 338d182fa542b3ca05456ad1ce9cebe6580083b1 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 8 Nov 2017 11:23:46 -0800 Subject: ipv6: try not to take rtnl_lock in ip6mr_sk_done Avoid traversing the list of mr6_tables (which requires the rtnl_lock) in ip6mr_sk_done(), when we know in advance that a match will not be found. This can happen when rawv6_close()/ip6mr_sk_done() is invoked on non-mroute6 sockets. This patch helps reduce rtnl_lock contention when destroying a large number of net namespaces, each having a non-mroute6 raw socket. v2: same patch, only fixed subject line and expanded comment. Signed-off-by: Francesco Ruggeri Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 59fad81e5f7a..9c24b85949c1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1617,6 +1617,10 @@ int ip6mr_sk_done(struct sock *sk) struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return err; + rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == mrt->mroute6_sk) { -- cgit v1.2.3 From 112f9cb65643caf7b922e1a66dc752bfab40aeb1 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:26 -0800 Subject: netem: convert to qdisc_watchdog_schedule_ns Upgrade the internal netem scheduler to use nanoseconds rather than ticks throughout. Convert to and from the std "ticks" userspace api automatically, while allowing for finer grained scheduling to take place. Signed-off-by: Dave Taht Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 56 +++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index db0228a65e8c..e64e0e0d94ff 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,8 +77,8 @@ struct netem_sched_data { struct qdisc_watchdog watchdog; - psched_tdiff_t latency; - psched_tdiff_t jitter; + s64 latency; + s64 jitter; u32 loss; u32 ecn; @@ -145,7 +145,7 @@ struct netem_sched_data { * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { - psched_time_t time_to_send; + u64 time_to_send; }; static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) @@ -305,11 +305,11 @@ static bool loss_event(struct netem_sched_data *q) * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ -static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, - struct crndstate *state, - const struct disttable *dist) +static s64 tabledist(s64 mu, s64 sigma, + struct crndstate *state, + const struct disttable *dist) { - psched_tdiff_t x; + s64 x; long t; u32 rnd; @@ -332,10 +332,10 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +static u64 packet_len_2_sched_time(unsigned int len, + struct netem_sched_data *q) { - u64 ticks; - + u64 offset; len += q->packet_overhead; if (q->cell_size) { @@ -345,11 +345,9 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche cells++; len = cells * (q->cell_size + q->cell_overhead); } - - ticks = (u64)len * NSEC_PER_SEC; - - do_div(ticks, q->rate); - return PSCHED_NS2TICKS(ticks); + offset = (u64)len * NSEC_PER_SEC; + do_div(offset, q->rate); + return offset; } static void tfifo_reset(struct Qdisc *sch) @@ -369,7 +367,7 @@ static void tfifo_reset(struct Qdisc *sch) static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + u64 tnext = netem_skb_cb(nskb)->time_to_send; struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { @@ -515,13 +513,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor)) { - psched_time_t now; - psched_tdiff_t delay; + u64 now; + s64 delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - now = psched_get_time(); + now = ktime_get_ns(); if (q->rate) { struct netem_skb_cb *last = NULL; @@ -547,7 +545,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * from delay. */ delay -= last->time_to_send - now; - delay = max_t(psched_tdiff_t, 0, delay); + delay = max_t(s64, 0, delay); now = last->time_to_send; } @@ -562,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * Do re-ordering by putting one out of N packets at the front * of the queue. */ - cb->time_to_send = psched_get_time(); + cb->time_to_send = ktime_get_ns(); q->counter = 0; netem_enqueue_skb_head(&sch->q, skb); @@ -609,13 +607,13 @@ deliver: } p = rb_first(&q->t_root); if (p) { - psched_time_t time_to_send; + u64 time_to_send; skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; - if (time_to_send <= psched_get_time()) { + if (time_to_send <= ktime_get_ns()) { rb_erase(p, &q->t_root); sch->q.qlen--; @@ -659,7 +657,7 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule(&q->watchdog, time_to_send); + qdisc_watchdog_schedule_ns(&q->watchdog, time_to_send); } if (q->qdisc) { @@ -888,8 +886,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) sch->limit = qopt->limit; - q->latency = qopt->latency; - q->jitter = qopt->jitter; + q->latency = PSCHED_TICKS2NS(qopt->latency); + q->jitter = PSCHED_TICKS2NS(qopt->jitter); q->limit = qopt->limit; q->gap = qopt->gap; q->counter = 0; @@ -1011,8 +1009,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; - qopt.latency = q->latency; - qopt.jitter = q->jitter; + qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), + UINT_MAX); + qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter), + UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; qopt.gap = q->gap; -- cgit v1.2.3 From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:27 -0800 Subject: netem: add uapi to express delay and jitter in nanoseconds netem userspace has long relied on a horrible /proc/net/psched hack to translate the current notion of "ticks" to nanoseconds. Expressing latency and jitter instead, in well defined nanoseconds, increases the dynamic range of emulated delays and jitter in netem. It will also ease a transition where reducing a tick to nsec equivalence would constrain the max delay in prior versions of netem to only 4.3 seconds. Signed-off-by: Dave Taht Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e64e0e0d94ff..47d6decba0ea 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -819,6 +819,8 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, + [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, + [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -916,6 +918,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); + if (tb[TCA_NETEM_LATENCY64]) + q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); + + if (tb[TCA_NETEM_JITTER64]) + q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); + if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); @@ -1020,6 +1028,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; + if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) + goto nla_put_failure; + + if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) + goto nla_put_failure; + cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; -- cgit v1.2.3 From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:28 -0800 Subject: netem: support delivering packets in delayed time slots Slotting is a crude approximation of the behaviors of shared media such as cable, wifi, and LTE, which gather up a bunch of packets within a varying delay window and deliver them, relative to that, nearly all at once. It works within the existing loss, duplication, jitter and delay parameters of netem. Some amount of inherent latency must be specified, regardless. The new "slot" parameter specifies a minimum and maximum delay between transmission attempts. The "bytes" and "packets" parameters can be used to limit the amount of information transferred per slot. Examples of use: tc qdisc add dev eth0 root netem delay 200us \ slot 800us 10ms bytes 64k packets 42 A more correct example, using stacked netem instances and a packet limit to emulate a tail drop wifi queue with slots and variable packet delivery, with a 200Mbit isochronous underlying rate, and 20ms path delay: tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \ limit 10000 tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \ slot 800us 10ms bytes 64k packets 42 limit 512 Signed-off-by: Dave Taht Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 47d6decba0ea..b686e755fda9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -135,6 +135,13 @@ struct netem_sched_data { u32 a5; /* p23 used only in 4-states */ } clg; + struct tc_netem_slot slot_config; + struct slotstate { + u64 slot_next; + s32 packets_left; + s32 bytes_left; + } slot; + }; /* Time stamp put into socket buffer control block @@ -591,6 +598,20 @@ finish_segs: return NET_XMIT_SUCCESS; } +/* Delay the next round with a new future slot with a + * correct number of bytes and packets. + */ + +static void get_slot_next(struct netem_sched_data *q, u64 now) +{ + q->slot.slot_next = now + q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); @@ -608,14 +629,17 @@ deliver: p = rb_first(&q->t_root); if (p) { u64 time_to_send; + u64 now = ktime_get_ns(); skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; - if (time_to_send <= ktime_get_ns()) { - rb_erase(p, &q->t_root); + if (q->slot.slot_next && q->slot.slot_next < time_to_send) + get_slot_next(q, now); + if (time_to_send <= now && q->slot.slot_next <= now) { + rb_erase(p, &q->t_root); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; @@ -634,6 +658,14 @@ deliver: skb->tstamp = 0; #endif + if (q->slot.slot_next) { + q->slot.packets_left--; + q->slot.bytes_left -= qdisc_pkt_len(skb); + if (q->slot.packets_left <= 0 || + q->slot.bytes_left <= 0) + get_slot_next(q, now); + } + if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; @@ -657,7 +689,10 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule_ns(&q->watchdog, time_to_send); + + qdisc_watchdog_schedule_ns(&q->watchdog, + max(time_to_send, + q->slot.slot_next)); } if (q->qdisc) { @@ -688,6 +723,7 @@ static void dist_free(struct disttable *d) * Distribution data is a variable size payload containing * signed 16 bit values. */ + static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -718,6 +754,23 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return 0; } +static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) +{ + const struct tc_netem_slot *c = nla_data(attr); + + q->slot_config = *c; + if (q->slot_config.max_packets == 0) + q->slot_config.max_packets = INT_MAX; + if (q->slot_config.max_bytes == 0) + q->slot_config.max_bytes = INT_MAX; + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; + if (q->slot_config.min_delay | q->slot_config.max_delay) + q->slot.slot_next = ktime_get_ns(); + else + q->slot.slot_next = 0; +} + static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); @@ -821,6 +874,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_RATE64] = { .type = NLA_U64 }, [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, + [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -927,6 +981,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + if (tb[TCA_NETEM_SLOT]) + get_slot(q, tb[TCA_NETEM_SLOT]); + return ret; } @@ -1016,6 +1073,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; + struct tc_netem_slot slot; qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); @@ -1070,6 +1128,16 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; + if (q->slot_config.min_delay | q->slot_config.max_delay) { + slot = q->slot_config; + if (slot.max_packets == INT_MAX) + slot.max_packets = 0; + if (slot.max_bytes == INT_MAX) + slot.max_bytes = 0; + if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) + goto nla_put_failure; + } + return nla_nest_end(skb, nla); nla_put_failure: -- cgit v1.2.3 From 0642840b8bb008528dbdf929cec9f65ac4231ad0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 9 Nov 2017 13:04:44 +0900 Subject: af_netlink: ensure that NLMSG_DONE never fails in dumps The way people generally use netlink_dump is that they fill in the skb as much as possible, breaking when nla_put returns an error. Then, they get called again and start filling out the next skb, and again, and so forth. The mechanism at work here is the ability for the iterative dumping function to detect when the skb is filled up and not fill it past the brim, waiting for a fresh skb for the rest of the data. However, if the attributes are small and nicely packed, it is possible that a dump callback function successfully fills in attributes until the skb is of size 4080 (libmnl's default page-sized receive buffer size). The dump function completes, satisfied, and then, if it happens to be that this is actually the last skb, and no further ones are to be sent, then netlink_dump will add on the NLMSG_DONE part: nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); It is very important that netlink_dump does this, of course. However, in this example, that call to nlmsg_put_answer will fail, because the previous filling by the dump function did not leave it enough room. And how could it possibly have done so? All of the nla_put variety of functions simply check to see if the skb has enough tailroom, independent of the context it is in. In order to keep the important assumptions of all netlink dump users, it is therefore important to give them an skb that has this end part of the tail already reserved, so that the call to nlmsg_put_answer does not fail. Otherwise, library authors are forced to find some bizarre sized receive buffer that has a large modulo relative to the common sizes of messages received, which is ugly and buggy. This patch thus saves the NLMSG_DONE for an additional message, for the case that things are dangerously close to the brim. This requires keeping track of the errno from ->dump() across calls. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 17 +++++++++++------ net/netlink/af_netlink.h | 1 + 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 26ded4239611..a3eab903a9cd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2136,7 +2136,7 @@ static int netlink_dump(struct sock *sk) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; struct module *module; - int len, err = -ENOBUFS; + int err = -ENOBUFS; int alloc_min_size; int alloc_size; @@ -2183,9 +2183,11 @@ static int netlink_dump(struct sock *sk) skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); - len = cb->dump(skb, cb); + if (nlk->dump_done_errno > 0) + nlk->dump_done_errno = cb->dump(skb, cb); - if (len > 0) { + if (nlk->dump_done_errno > 0 || + skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(nlk->cb_mutex); if (sk_filter(sk, skb)) @@ -2195,13 +2197,15 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); - if (!nlh) + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, + sizeof(nlk->dump_done_errno), NLM_F_MULTI); + if (WARN_ON(!nlh)) goto errout_skb; nl_dump_check_consistent(cb, nlh); - memcpy(nlmsg_data(nlh), &len, sizeof(len)); + memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, + sizeof(nlk->dump_done_errno)); if (sk_filter(sk, skb)) kfree_skb(skb); @@ -2273,6 +2277,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, } nlk->cb_running = true; + nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 028188597eaa..962de7b3c023 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -34,6 +34,7 @@ struct netlink_sock { wait_queue_head_t wait; bool bound; bool cb_running; + int dump_done_errno; struct netlink_callback cb; struct mutex *cb_mutex; struct mutex cb_def_mutex; -- cgit v1.2.3 From ee9d3429c0e47a57e3e73b638785cafa33773639 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 10 Nov 2017 15:09:53 -0800 Subject: net/sched/sch_red.c: work around gcc-4.4.4 anon union initializer issue gcc-4.4.4 (at lest) has issues with initializers and anonymous unions: net/sched/sch_red.c: In function 'red_dump_offload': net/sched/sch_red.c:282: error: unknown field 'stats' specified in initializer net/sched/sch_red.c:282: warning: initialization makes integer from pointer without a cast net/sched/sch_red.c:283: error: unknown field 'stats' specified in initializer net/sched/sch_red.c:283: warning: initialization makes integer from pointer without a cast net/sched/sch_red.c: In function 'red_dump_stats': net/sched/sch_red.c:352: error: unknown field 'xstats' specified in initializer net/sched/sch_red.c:352: warning: initialization makes integer from pointer without a cast Work around this. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Cc: Nogah Frankel Cc: Jiri Pirko Cc: Simon Horman Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/sched/sch_red.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 007dd8ef8aac..7f8ea9e297c3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -276,11 +276,13 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) { struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { + .command = TC_RED_STATS, .handle = sch->handle, .parent = sch->parent, - .command = TC_RED_STATS, - .stats.bstats = &sch->bstats, - .stats.qstats = &sch->qstats, + { + .stats.bstats = &sch->bstats, + .stats.qstats = &sch->qstats, + }, }; int err; @@ -346,10 +348,12 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { struct red_stats hw_stats = {0}; struct tc_red_qopt_offload hw_stats_request = { + .command = TC_RED_XSTATS, .handle = sch->handle, .parent = sch->parent, - .command = TC_RED_XSTATS, - .xstats = &hw_stats, + { + .xstats = &hw_stats, + }, }; if (!dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, -- cgit v1.2.3 From 5ed4e3eb021762fee584ce65620bc822131c7aa0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:52 -0800 Subject: net: dsa: Pass a port to get_tag_protocol() A number of drivers want to check whether the configured CPU port is a possible configuration for enabling tagging, pass down the CPU port number so they verify that. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- net/dsa/legacy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index fd54a8e17986..44e3fb7dec8c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -539,7 +539,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds); + tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 4863e3e398b6..84611d7fcfa2 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -151,7 +151,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ops->get_tag_protocol(ds); + tag_protocol = ops->get_tag_protocol(ds, dst->cpu_dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); -- cgit v1.2.3 From f7c39e3d1e094af1c3e0676b76c00d8c1f8e7774 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:53 -0800 Subject: net: dsa: tag_brcm: Prepare for supporting prepended tag In preparation for supporting the same Broadcom tag format, but instead of inserted between the MAC SA and EtherType, prepended to the Ethernet frame, restructure the code a little bit to make that possible and take an offset parameter. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/tag_brcm.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9e082bae3cb0..771409a1e65c 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -59,7 +59,9 @@ #define BRCM_EG_TC_MASK 0x7 #define BRCM_EG_PID_MASK 0x1f -static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, + struct net_device *dev, + unsigned int offset) { struct dsa_port *dp = dsa_slave_to_port(dev); u16 queue = skb_get_queue_mapping(skb); @@ -70,10 +72,10 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev skb_push(skb, BRCM_TAG_LEN); - memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN); + if (offset) + memmove(skb->data, skb->data + BRCM_TAG_LEN, offset); - /* Build the tag after the MAC Source Address */ - brcm_tag = skb->data + 2 * ETH_ALEN; + brcm_tag = skb->data + offset; /* Set the ingress opcode, traffic class, tag enforcment is * deprecated @@ -94,8 +96,17 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev return skb; } -static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Build the tag after the MAC Source Address */ + return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); +} + +static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + unsigned int offset) { int source_port; u8 *brcm_tag; @@ -103,8 +114,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) return NULL; - /* skb->data points to the EtherType, the tag is right before it */ - brcm_tag = skb->data - 2; + brcm_tag = skb->data - offset; /* The opcode should never be different than 0b000 */ if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) @@ -126,12 +136,25 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + return skb; +} + +static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + struct sk_buff *nskb; + + /* skb->data points to the EtherType, the tag is right before it */ + nskb = brcm_tag_rcv_ll(skb, dev, pt, 2); + if (!nskb) + return nskb; + /* Move the Ethernet DA and SA */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - BRCM_TAG_LEN, + memmove(nskb->data - ETH_HLEN, + nskb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - return skb; + return nskb; } const struct dsa_device_ops brcm_netdev_ops = { -- cgit v1.2.3 From b74b70c44986dee87881fbed3d912e02c5dcf78c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:54 -0800 Subject: net: dsa: Support prepended Broadcom tag Add a new type: DSA_TAG_PROTO_PREPEND which allows us to support for the 4-bytes Broadcom tag that we already support, but in a format where it is pre-pended to the packet instead of located between the MAC SA and the Ethertyper (DSA_TAG_PROTO_BRCM). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/Kconfig | 3 +++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 +++ net/dsa/dsa_priv.h | 1 + net/dsa/tag_brcm.c | 39 ++++++++++++++++++++++++++++++++------- 5 files changed, 40 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index cc5f8f971689..2fed892094bc 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -19,6 +19,9 @@ if NET_DSA config NET_DSA_TAG_BRCM bool +config NET_DSA_TAG_BRCM_PREPEND + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index e9a4a0f33e86..0e13c1f95d13 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,6 +5,7 @@ dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b8f2d9f7c3ed..6a9d0f50fbee 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -44,6 +44,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND + [DSA_TAG_PROTO_BRCM_PREPEND] = &brcm_prepend_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_DSA [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, #endif diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 507e1ce4d4d2..7d036696e8c4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -191,6 +191,7 @@ void dsa_switch_unregister_notifier(struct dsa_switch *ds); /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +extern const struct dsa_device_ops brcm_prepend_netdev_ops; /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 771409a1e65c..e6e0b7b6025c 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -96,13 +96,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, return skb; } -static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Build the tag after the MAC Source Address */ - return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); -} - static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, @@ -139,6 +132,15 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, return skb; } +#ifdef CONFIG_NET_DSA_TAG_BRCM +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Build the tag after the MAC Source Address */ + return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); +} + + static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { @@ -161,3 +163,26 @@ const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, }; +#endif + +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND +static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, + struct net_device *dev) +{ + /* tag is prepended to the packet */ + return brcm_tag_xmit_ll(skb, dev, 0); +} + +static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + /* tag is prepended to the packet */ + return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); +} + +const struct dsa_device_ops brcm_prepend_netdev_ops = { + .xmit = brcm_tag_xmit_prepend, + .rcv = brcm_tag_rcv_prepend, +}; +#endif -- cgit v1.2.3 From 9602c01e57f7b868d748c2ba2aef0efa64b71ffc Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:41 -0800 Subject: openvswitch: export get_dp() API. Later patches will invoke get_dp() outside of datapath.c. Export it. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 29 ----------------------------- net/openvswitch/datapath.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4d38ac044cee..6e098035bb8f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); -/* Must be called with rcu_read_lock. */ -static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) -{ - struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - - if (dev) { - struct vport *vport = ovs_internal_dev_get_vport(dev); - if (vport) - return vport->dp; - } - - return NULL; -} - -/* The caller must hold either ovs_mutex or rcu_read_lock to keep the - * returned dp pointer valid. - */ -static inline struct datapath *get_dp(struct net *net, int dp_ifindex) -{ - struct datapath *dp; - - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); - rcu_read_lock(); - dp = get_dp_rcu(net, dp_ifindex); - rcu_read_unlock(); - - return dp; -} - /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4a104ef9e12c..954c4ed465a5 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,6 +30,7 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" +#include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n return ovs_lookup_vport(dp, port_no); } +/* Must be called with rcu_read_lock. */ +static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -- cgit v1.2.3 From 96fbc13d7e770b542d2d1fcf700d0baadc6e8063 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:42 -0800 Subject: openvswitch: Add meter infrastructure OVS kernel datapath so far does not support Openflow meter action. This is the first stab at adding kernel datapath meter support. This implementation supports only drop band type. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/openvswitch/Makefile | 1 + net/openvswitch/datapath.c | 14 +- net/openvswitch/datapath.h | 3 + net/openvswitch/meter.c | 604 +++++++++++++++++++++++++++++++++++++++++++++ net/openvswitch/meter.h | 54 ++++ 5 files changed, 674 insertions(+), 2 deletions(-) create mode 100644 net/openvswitch/meter.c create mode 100644 net/openvswitch/meter.h (limited to 'net') diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 299f4476cf44..41109c326f3a 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -12,6 +12,7 @@ openvswitch-y := \ flow.o \ flow_netlink.o \ flow_table.o \ + meter.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6e098035bb8f..0dab33fb9844 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -55,6 +55,7 @@ #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" +#include "meter.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); + ovs_meters_exit(dp); kfree(dp); } @@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_meters_init(dp); + if (err) + goto err_destroy_ports_array; + /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; @@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_destroy_ports_array; + goto err_destroy_meters; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_ports_array: +err_destroy_meters: ovs_unlock(); + ovs_meters_exit(dp); +err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); @@ -2265,6 +2273,7 @@ static struct genl_family * const dp_genl_families[] = { &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, + &dp_meter_genl_family, }; static void dp_unregister_genl(int n_families) @@ -2445,3 +2454,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 954c4ed465a5..5d2997b42460 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -92,6 +92,9 @@ struct datapath { u32 user_features; u32 max_headroom; + + /* Switch meters. */ + struct hlist_head *meters; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c new file mode 100644 index 000000000000..2a5ba356c472 --- /dev/null +++ b/net/openvswitch/meter.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "datapath.h" +#include "meter.h" + +#define METER_HASH_BUCKETS 1024 + +static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { + [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, + [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, + [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, + [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, + [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, + [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { + [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, +}; + +static void rcu_free_ovs_meter_callback(struct rcu_head *rcu) +{ + struct dp_meter *meter = container_of(rcu, struct dp_meter, rcu); + + kfree(meter); +} + +static void ovs_meter_free(struct dp_meter *meter) +{ + if (!meter) + return; + + call_rcu(&meter->rcu, rcu_free_ovs_meter_callback); +} + +static struct hlist_head *meter_hash_bucket(const struct datapath *dp, + u32 meter_id) +{ + return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; +} + +/* Call with ovs_mutex or RCU read lock. */ +static struct dp_meter *lookup_meter(const struct datapath *dp, + u32 meter_id) +{ + struct dp_meter *meter; + struct hlist_head *head; + + head = meter_hash_bucket(dp, meter_id); + hlist_for_each_entry_rcu(meter, head, dp_hash_node) { + if (meter->id == meter_id) + return meter; + } + return NULL; +} + +static void attach_meter(struct datapath *dp, struct dp_meter *meter) +{ + struct hlist_head *head = meter_hash_bucket(dp, meter->id); + + hlist_add_head_rcu(&meter->dp_hash_node, head); +} + +static void detach_meter(struct dp_meter *meter) +{ + ASSERT_OVSL(); + if (meter) + hlist_del_rcu(&meter->dp_hash_node); +} + +static struct sk_buff * +ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct sk_buff *skb; + struct ovs_header *ovs_header = info->userhdr; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, + info->snd_seq, + &dp_meter_genl_family, 0, cmd); + if (!ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, + struct dp_meter *meter) +{ + struct nlattr *nla; + struct dp_meter_band *band; + u16 i; + + if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) + goto error; + + if (!meter) + return 0; + + if (nla_put(reply, OVS_METER_ATTR_STATS, + sizeof(struct ovs_flow_stats), &meter->stats) || + nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + OVS_METER_ATTR_PAD)) + goto error; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto error; + + band = meter->bands; + + for (i = 0; i < meter->n_bands; ++i, ++band) { + struct nlattr *band_nla; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, + sizeof(struct ovs_flow_stats), + &band->stats)) + goto error; + nla_nest_end(reply, band_nla); + } + nla_nest_end(reply, nla); + + return 0; +error: + return -EMSGSIZE; +} + +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct nlattr *nla, *band_nla; + int err; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, + &ovs_reply_header); + if (!reply) + return PTR_ERR(reply); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || + nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + goto nla_put_failure; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto nla_put_failure; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla) + goto nla_put_failure; + /* Currently only DROP band type is supported. */ + if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) + goto nla_put_failure; + nla_nest_end(reply, band_nla); + nla_nest_end(reply, nla); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +nla_put_failure: + nlmsg_free(reply); + err = -EMSGSIZE; + return err; +} + +static struct dp_meter *dp_meter_create(struct nlattr **a) +{ + struct nlattr *nla; + int rem; + u16 n_bands = 0; + struct dp_meter *meter; + struct dp_meter_band *band; + int err; + + /* Validate attributes, count the bands. */ + if (!a[OVS_METER_ATTR_BANDS]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) + if (++n_bands > DP_MAX_BANDS) + return ERR_PTR(-EINVAL); + + /* Allocate and set up the meter before locking anything. */ + meter = kzalloc(n_bands * sizeof(struct dp_meter_band) + + sizeof(*meter), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + meter->used = div_u64(ktime_get_ns(), 1000 * 1000); + meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; + meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; + spin_lock_init(&meter->lock); + if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { + meter->stats = *(struct ovs_flow_stats *) + nla_data(a[OVS_METER_ATTR_STATS]); + } + meter->n_bands = n_bands; + + /* Set up meter bands. */ + band = meter->bands; + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { + struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; + u32 band_max_delta_t; + + err = nla_parse((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, + nla_data(nla), nla_len(nla), band_policy, + NULL); + if (err) + goto exit_free_meter; + + if (!attr[OVS_BAND_ATTR_TYPE] || + !attr[OVS_BAND_ATTR_RATE] || + !attr[OVS_BAND_ATTR_BURST]) { + err = -EINVAL; + goto exit_free_meter; + } + + band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); + band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); + /* Figure out max delta_t that is enough to fill any bucket. + * Keep max_delta_t size to the bucket units: + * pkts => 1/1000 packets, kilobits => bits. + */ + band_max_delta_t = (band->burst_size + band->rate) * 1000; + /* Start with a full bucket. */ + band->bucket = band_max_delta_t; + if (band_max_delta_t > meter->max_delta_t) + meter->max_delta_t = band_max_delta_t; + band++; + } + + return meter; + +exit_free_meter: + kfree(meter); + return ERR_PTR(err); +} + +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct dp_meter *meter, *old_meter; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_header *ovs_header = info->userhdr; + struct datapath *dp; + int err; + u32 meter_id; + bool failed; + + meter = dp_meter_create(a); + if (IS_ERR_OR_NULL(meter)) + return PTR_ERR(meter); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto exit_free_meter; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (!a[OVS_METER_ATTR_ID]) { + err = -ENODEV; + goto exit_unlock; + } + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + /* Cannot fail after this. */ + old_meter = lookup_meter(dp, meter_id); + detach_meter(old_meter); + attach_meter(dp, meter); + ovs_unlock(); + + /* Build response with the meter_id and stats from + * the old meter, if any. + */ + failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); + WARN_ON(failed); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + if (old_meter->keep_stats) { + err = ovs_meter_cmd_reply_stats(reply, meter_id, + old_meter); + WARN_ON(err); + } + spin_unlock_bh(&old_meter->lock); + ovs_meter_free(old_meter); + } + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); +exit_free_meter: + kfree(meter); + return err; +} + +static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + /* Locate meter, copy stats. */ + meter = lookup_meter(dp, meter_id); + if (!meter) { + err = -ENOENT; + goto exit_unlock; + } + + spin_lock_bh(&meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + spin_unlock_bh(&meter->lock); + if (err) + goto exit_unlock; + + ovs_unlock(); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *old_meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + old_meter = lookup_meter(dp, meter_id); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); + WARN_ON(err); + spin_unlock_bh(&old_meter->lock); + detach_meter(old_meter); + } + ovs_unlock(); + ovs_meter_free(old_meter); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +/* Meter action execution. + * + * Return true 'meter_id' drop band is triggered. The 'skb' should be + * dropped by the caller'. + */ +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id) +{ + struct dp_meter *meter; + struct dp_meter_band *band; + long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); + long long int long_delta_ms; + u32 delta_ms; + u32 cost; + int i, band_exceeded_max = -1; + u32 band_exceeded_rate = 0; + + meter = lookup_meter(dp, meter_id); + /* Do not drop the packet when there is no meter. */ + if (!meter) + return false; + + /* Lock the meter while using it. */ + spin_lock(&meter->lock); + + long_delta_ms = (now_ms - meter->used); /* ms */ + + /* Make sure delta_ms will not be too large, so that bucket will not + * wrap around below. + */ + delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) + ? meter->max_delta_t : (u32)long_delta_ms; + + /* Update meter statistics. + */ + meter->used = now_ms; + meter->stats.n_packets += 1; + meter->stats.n_bytes += skb->len; + + /* Bucket rate is either in kilobits per second, or in packets per + * second. We maintain the bucket in the units of either bits or + * 1/1000th of a packet, correspondingly. + * Then, when rate is multiplied with milliseconds, we get the + * bucket units: + * msec * kbps = bits, and + * msec * packets/sec = 1/1000 packets. + * + * 'cost' is the number of bucket units in this packet. + */ + cost = (meter->kbps) ? skb->len * 8 : 1000; + + /* Update all bands and find the one hit with the highest rate. */ + for (i = 0; i < meter->n_bands; ++i) { + long long int max_bucket_size; + + band = &meter->bands[i]; + max_bucket_size = (band->burst_size + band->rate) * 1000; + + band->bucket += delta_ms * band->rate; + if (band->bucket > max_bucket_size) + band->bucket = max_bucket_size; + + if (band->bucket >= cost) { + band->bucket -= cost; + } else if (band->rate > band_exceeded_rate) { + band_exceeded_rate = band->rate; + band_exceeded_max = i; + } + } + + if (band_exceeded_max >= 0) { + /* Update band statistics. */ + band = &meter->bands[band_exceeded_max]; + band->stats.n_packets += 1; + band->stats.n_bytes += skb->len; + + /* Drop band triggered, let the caller drop the 'skb'. */ + if (band->type == OVS_METER_BAND_TYPE_DROP) { + spin_unlock(&meter->lock); + return true; + } + } + + spin_unlock(&meter->lock); + return false; +} + +static struct genl_ops dp_meter_genl_ops[] = { + { .cmd = OVS_METER_CMD_FEATURES, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_features + }, + { .cmd = OVS_METER_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_set, + }, + { .cmd = OVS_METER_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_get, + }, + { .cmd = OVS_METER_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_del + }, +}; + +static const struct genl_multicast_group ovs_meter_multicast_group = { + .name = OVS_METER_MCGROUP, +}; + +struct genl_family dp_meter_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_METER_FAMILY, + .version = OVS_METER_VERSION, + .maxattr = OVS_METER_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_meter_genl_ops, + .n_ops = ARRAY_SIZE(dp_meter_genl_ops), + .mcgrps = &ovs_meter_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, +}; + +int ovs_meters_init(struct datapath *dp) +{ + int i; + + dp->meters = kmalloc_array(METER_HASH_BUCKETS, + sizeof(struct hlist_head), GFP_KERNEL); + + if (!dp->meters) + return -ENOMEM; + + for (i = 0; i < METER_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->meters[i]); + + return 0; +} + +void ovs_meters_exit(struct datapath *dp) +{ + int i; + + for (i = 0; i < METER_HASH_BUCKETS; i++) { + struct hlist_head *head = &dp->meters[i]; + struct dp_meter *meter; + struct hlist_node *n; + + hlist_for_each_entry_safe(meter, n, head, dp_hash_node) + kfree(meter); + } + + kfree(dp->meters); +} diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h new file mode 100644 index 000000000000..964ace2650f8 --- /dev/null +++ b/net/openvswitch/meter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#ifndef METER_H +#define METER_H 1 + +#include +#include +#include +#include +#include +#include +#include + +#include "flow.h" +struct datapath; + +#define DP_MAX_BANDS 1 + +struct dp_meter_band { + u32 type; + u32 rate; + u32 burst_size; + u32 bucket; /* 1/1000 packets, or in bits */ + struct ovs_flow_stats stats; +}; + +struct dp_meter { + spinlock_t lock; /* Per meter lock */ + struct rcu_head rcu; + struct hlist_node dp_hash_node; /*Element in datapath->meters + * hash table. + */ + u32 id; + u16 kbps:1, keep_stats:1; + u16 n_bands; + u32 max_delta_t; + u64 used; + struct ovs_flow_stats stats; + struct dp_meter_band bands[]; +}; + +extern struct genl_family dp_meter_genl_family; +int ovs_meters_init(struct datapath *dp); +void ovs_meters_exit(struct datapath *dp); +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id); + +#endif /* meter.h */ -- cgit v1.2.3 From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:43 -0800 Subject: openvswitch: Add meter action support Implements OVS kernel meter action support. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 6 ++++++ net/openvswitch/datapath.h | 1 + net/openvswitch/flow_netlink.c | 6 ++++++ 3 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9a6a6d51e421..30a5df27116e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1330,6 +1330,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_NSH: err = pop_nsh(skb, key); break; + + case OVS_ACTION_ATTR_METER: + if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { + consume_skb(skb); + return 0; + } } if (unlikely(err)) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 5d2997b42460..523d65526766 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,6 +30,7 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" +#include "meter.h" #include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4201f9293af3..bb4dae198c78 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -90,6 +90,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_METER: default: return true; } @@ -2844,6 +2845,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_POP_ETH] = 0, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, + [OVS_ACTION_ATTR_METER] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3029,6 +3031,10 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; -- cgit v1.2.3 From 929fc0327569aa745c9c3cb68a213c22fad3f3f9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:49 +0800 Subject: ip6_gre: add the process for redirect in ip6gre_err This patch is to add redirect icmp packet process for ip6gre by calling ip6_redirect() in ip6gre_err(), as in vti6_err. Prior to this patch, there's even no route cache generated after receiving redirect. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3e10c51e7e0c..0684d0ccaaa5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -369,6 +369,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; int grehlen = sizeof(*greh); @@ -442,6 +443,10 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; return; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + return; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) -- cgit v1.2.3 From fe1a4ca0a2b7884672661284666a0b8c183b0b1e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:50 +0800 Subject: ip6_gre: process toobig in a better way Now ip6gre processes toobig icmp packet by setting gre dev's mtu in ip6gre_err, which would cause few things not good: - It couldn't set mtu with dev_set_mtu due to it's not in user context, which causes route cache and idev->cnf.mtu6 not to be updated. - It has to update sk dst pmtu in tx path according to gredev->mtu for ip6gre, while it updates pmtu again according to lower dst pmtu in ip6_tnl_xmit. - To change dev->mtu by toobig icmp packet is not a good idea, it should only work on pmtu. This patch is to process toobig by updating the lower dst's pmtu, as later sk dst pmtu will be updated in ip6_tnl_xmit, the same way as in ip4gre. Note that gre dev's mtu will not be updated any more, it doesn't make any sense to change dev's mtu after receiving a toobig packet. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 0684d0ccaaa5..b90bad7a4e56 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -403,9 +403,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; switch (type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -436,12 +435,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } return; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset - t->tun_hlen; - if (t->dev->type == ARPHRD_ETHER) - mtu -= ETH_HLEN; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, @@ -508,7 +502,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct dst_entry *dst = skb_dst(skb); __be16 protocol; if (dev->type == ARPHRD_ETHER) @@ -527,10 +520,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - /* TooBig packet may have updated dst->dev's mtu */ - if (dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } -- cgit v1.2.3 From 383c1f88759bba7f387b7b705c31081e5df40b38 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:51 +0800 Subject: ip6_tunnel: add the process for redirect in ip6_tnl_err The same process for redirect in "ip6_gre: add the process for redirect in ip6gre_err" is needed by ip4ip6 and ip6ip6 as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 439d65f7e094..a1f704c799d9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -471,15 +471,16 @@ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; - struct ip6_tnl *t; - int rel_msg = 0; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; - u8 tproto; __u32 rel_info = 0; - __u16 len; + struct ip6_tnl *t; int err = -ENOENT; + int rel_msg = 0; + u8 tproto; + __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further @@ -543,6 +544,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + break; } *type = rel_type; -- cgit v1.2.3 From b00f543240b9423eda73ad06c57becdd5478bc26 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:52 +0800 Subject: ip6_tunnel: process toobig in a better way The same improvement in "ip6_gre: process toobig in a better way" is needed by ip4ip6 and ip6ip6 as well. Note that ip4ip6 and ip6ip6 will also update sk dst pmtu in their err_handlers. Like I said before, gre6 could not do this as it's inner proto is not certain. But for all of them, sk dst pmtu will be updated in tx path if in need. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a1f704c799d9..7e9e205b69d9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -498,9 +498,8 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, err = 0; switch (*type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 mtu, teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -531,11 +530,11 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: + ip6_update_pmtu(skb, net, htonl(*info), 0, 0, + sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; - len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; -- cgit v1.2.3 From 77552cfa39c48e695c39d0553afc8c6018e411ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:53 +0800 Subject: ip6_tunnel: clean up ip4ip6 and ip6ip6's err_handlers This patch is to remove some useless codes of redirect and fix some indents on ip4ip6 and ip6ip6's err_handlers. Note that redirect icmp packet is already processed in ip6_tnl_err, the old redirect codes in ip4ip6_err actually never worked even before this patch. Besides, there's no need to send redirect to user's sk, it's for lower dst, so just remove it in this patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7e9e205b69d9..00882fdb1223 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -563,13 +563,12 @@ static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; - u8 rel_type = type; - u8 rel_code = code; __u32 rel_info = ntohl(info); - int err; - struct sk_buff *skb2; const struct iphdr *eiph; + struct sk_buff *skb2; + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; @@ -594,10 +593,6 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; - case NDISC_REDIRECT: - rel_type = ICMP_REDIRECT; - rel_code = ICMP_REDIR_HOST; - /* fall through */ default: return 0; } @@ -616,33 +611,26 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->saddr, 0, - 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, + 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; + ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - rt = NULL; rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->daddr, eiph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(eiph->tos), 0); - if (IS_ERR(rt) || - rt->dst.dev->type != ARPHRD_TUNNEL) { + eiph->daddr, eiph->saddr, 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { - ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) @@ -654,10 +642,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, + rel_info); } - if (rel_type == ICMP_REDIRECT) - skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -670,11 +657,10 @@ static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; - __u32 rel_info = ntohl(info); - int err; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); -- cgit v1.2.3 From 18370b36b28a6c1b059392e9b8f9a80332e51e7c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 15 Oct 2017 12:55:23 -0500 Subject: ceph: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva [idryomov@gmail.com: amended "Older OSDs" comment] Signed-off-by: Ilya Dryomov --- net/ceph/ceph_hash.c | 12 +++++++++++- net/ceph/messenger.c | 1 + net/ceph/mon_client.c | 5 +++-- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 67bb1f11e613..9a5850f264ed 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -47,28 +47,38 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) /* handle the last 11 bytes */ c = c + length; - switch (len) { /* all the case statements fall through */ + switch (len) { case 11: c = c + ((__u32)k[10] << 24); + /* fall through */ case 10: c = c + ((__u32)k[9] << 16); + /* fall through */ case 9: c = c + ((__u32)k[8] << 8); /* the first byte of c is reserved for the length */ + /* fall through */ case 8: b = b + ((__u32)k[7] << 24); + /* fall through */ case 7: b = b + ((__u32)k[6] << 16); + /* fall through */ case 6: b = b + ((__u32)k[5] << 8); + /* fall through */ case 5: b = b + k[4]; + /* fall through */ case 4: a = a + ((__u32)k[3] << 24); + /* fall through */ case 3: a = a + ((__u32)k[2] << 16); + /* fall through */ case 2: a = a + ((__u32)k[1] << 8); + /* fall through */ case 1: a = a + k[0]; /* case 0: nothing left to add */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ad93342c90d7..8a4d3758030b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -430,6 +430,7 @@ static void ceph_sock_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); + /* fall through */ case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 9ae1bab8c05d..1547107f4854 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1279,9 +1279,10 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, /* * Older OSDs don't set reply tid even if the orignal - * request had a non-zero tid. Workaround this weirdness - * by falling through to the allocate case. + * request had a non-zero tid. Work around this weirdness + * by allocating a new message. */ + /* fall through */ case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: -- cgit v1.2.3 From b11270853fa3654f08d4a6a03b23ddb220512d8d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 6 Nov 2017 21:57:26 -0800 Subject: libceph: don't WARN() if user tries to add invalid key The WARN_ON(!key->len) in set_secret() in net/ceph/crypto.c is hit if a user tries to add a key of type "ceph" with an invalid payload as follows (assuming CONFIG_CEPH_LIB=y): echo -e -n '\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' \ | keyctl padd ceph desc @s This can be hit by fuzzers. As this is merely bad input and not a kernel bug, replace the WARN_ON() with return -EINVAL. Fixes: 7af3ea189a9a ("libceph: stop allocating a new cipher on every crypto request") Cc: # v4.10+ Signed-off-by: Eric Biggers Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crypto.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 489610ac1cdd..bf9d079cbafd 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -37,7 +37,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) return -ENOTSUPP; } - WARN_ON(!key->len); + if (!key->len) + return -EINVAL; + key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; -- cgit v1.2.3 From ee0ab7a2b00fdb65e50ed9e252900b29c8fd802f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 11 Nov 2017 16:29:41 +0100 Subject: net: dsa: Fix dependencies on bridge DSA now uses one of the symbols exported by the bridge, br_vlan_enabled(). This has a stub, if the bridge is not enabled. However, if the bridge is enabled, we cannot have DSA built in and the bridge as a module, otherwise we get undefined symbols at link time: net/dsa/port.o: In function `dsa_port_vlan_add': net/dsa/port.c:255: undefined reference to `br_vlan_enabled' net/dsa/port.o: In function `dsa_port_vlan_del': net/dsa/port.c:270: undefined reference to `br_vlan_enabled' Reported-by: kbuild test robot Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 2fed892094bc..03c3bdf25468 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -7,6 +7,7 @@ config HAVE_NET_DSA config NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA && MAY_USE_DEVLINK + depends on BRIDGE || BRIDGE=n select NET_SWITCHDEV select PHYLIB ---help--- -- cgit v1.2.3 From 0e74aa1d79a5bbc663e03a2804399cae418a0321 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 10 Nov 2017 14:14:06 +1100 Subject: xfrm: Copy policy family in clone_policy The syzbot found an ancient bug in the IPsec code. When we cloned a socket policy (for example, for a child TCP socket derived from a listening socket), we did not copy the family field. This results in a live policy with a zero family field. This triggers a BUG_ON check in the af_key code when the cloned policy is retrieved. This patch fixes it by copying the family field over. Reported-by: syzbot Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6eb228a70131..2a6093840e7e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1306,6 +1306,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; + newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); -- cgit v1.2.3 From 663faeab5c7e205160f9dbeb9a699c5dbdb78ce2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:26:53 +0300 Subject: af_key: replace BUG_ON on WARN_ON in net_exit hook Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index a00d607e7224..3dffb892d52c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3845,7 +3845,7 @@ static void __net_exit pfkey_net_exit(struct net *net) struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); - BUG_ON(!hlist_empty(&net_pfkey->table)); + WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { -- cgit v1.2.3 From 669f8f1a5c6849d4522de4038c8d1a3e09fcc4ce Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:27:49 +0300 Subject: packet: exit_net cleanup check added Be sure that packet.sklist initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/packet/af_packet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9603f6ff17a4..737092ca9b4e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4560,6 +4560,7 @@ static int __net_init packet_net_init(struct net *net) static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); + WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { -- cgit v1.2.3 From ee21b18b6bdbb17a6e0b26255d2f662baf0d8409 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:28:46 +0300 Subject: netdev: exit_net cleanup check added Be sure that dev_base_head list initialized in net_init hook was return to initial state Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 30b5fe32c525..658337bf33e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8667,6 +8667,8 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + if (net != &init_net) + WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { -- cgit v1.2.3 From 0b6f59553544f47caa940c01015ac1f1113f046a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:29:33 +0300 Subject: fib_notifier: exit_net cleanup check added Be sure that fib_notifier_ops list initilized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/core/fib_notifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 4fc202dbdfb6..6b8cd494574f 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -161,8 +161,14 @@ static int __net_init fib_notifier_net_init(struct net *net) return 0; } +static void __net_exit fib_notifier_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops)); +} + static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, + .exit = fib_notifier_net_exit, }; static int __init fib_notifier_init(void) -- cgit v1.2.3 From ce2b7db38af1ffefa6b04b5113b4d69ad36d38ba Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:30:01 +0300 Subject: fib_rules: exit_net cleanup check added Be sure that rules_ops list initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/core/fib_rules.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index fafd0a41e3f7..98e1066c3d55 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1022,8 +1022,14 @@ static int __net_init fib_rules_net_init(struct net *net) return 0; } +static void __net_exit fib_rules_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->rules_ops)); +} + static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, + .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) -- cgit v1.2.3 From 1e7af3b2cdf2dc9e4fd86c799414b3c504975270 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:30:31 +0300 Subject: l2tp: exit_net cleanup check added Be sure that l2tp_session_hlist array initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 350fcd39ebd8..115918ad8eca 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1833,6 +1833,7 @@ static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; + int hash; rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { @@ -1842,6 +1843,9 @@ static __net_exit void l2tp_exit_net(struct net *net) flush_workqueue(l2tp_wq); rcu_barrier(); + + for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) + WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash])); } static struct pernet_operations l2tp_net_ops = { -- cgit v1.2.3 From ae61e8cd061d6ce9a2e2a13e081e7d936139e01e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:32:47 +0300 Subject: phonet: exit_net cleanup check added Be sure that pndevs.list initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 2cb4c5dfad6f..77787512fc32 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -331,7 +331,10 @@ static int __net_init phonet_init_net(struct net *net) static void __net_exit phonet_exit_net(struct net *net) { + struct phonet_net *pnn = phonet_pernet(net); + remove_proc_entry("phonet", net->proc_net); + WARN_ON_ONCE(!list_empty(&pnn->pndevs.list)); } static struct pernet_operations phonet_net_ops = { -- cgit v1.2.3 From baeb0dbbb56598a2ccd98b56e0da3e9d22869112 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:34:03 +0300 Subject: xfrm6_tunnel: exit_net cleanup check added Be sure that spi_byaddr and spi_byspi arrays initialized in net_init hook were return to initial state Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv6/xfrm6_tunnel.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 4e438bc7ee87..f85f0d7480ac 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -338,6 +338,14 @@ static int __net_init xfrm6_tunnel_net_init(struct net *net) static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { -- cgit v1.2.3 From 3a9b76fd0db9f0d426533f96a68a62a58753a51e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Nov 2017 15:54:12 -0800 Subject: tcp: allow drivers to tweak TSQ logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I had many reports that TSQ logic breaks wifi aggregation. Current logic is to allow up to 1 ms of bytes to be queued into qdisc and drivers queues. But Wifi aggregation needs a bigger budget to allow bigger rates to be discovered by various TCP Congestion Controls algorithms. This patch adds an extra socket field, allowing wifi drivers to select another log scale to derive TCP Small Queue credit from current pacing rate. Initial value is 10, meaning that this patch does not change current behavior. We expect wifi drivers to set this field to smaller values (tests have been done with values from 6 to 9) They would have to use following template : if (skb->sk && skb->sk->sk_pacing_shift != MY_PACING_SHIFT) skb->sk->sk_pacing_shift = MY_PACING_SHIFT; Ref: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1670041 Signed-off-by: Eric Dumazet Cc: Johannes Berg Cc: Toke Høiland-Jørgensen Cc: Kir Kolyshkin Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/sock.c | 1 + net/ipv4/tcp_output.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 57bbd6040eb6..13719af7b4e3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2746,6 +2746,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0256f7a41041..76dbe884f246 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1720,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> 10, + bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -2198,7 +2198,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, { unsigned int limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); limit = min_t(u32, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; -- cgit v1.2.3 From 61ef6da622aa7b66bf92991bd272490eea6c712e Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:44 +0200 Subject: tls: Use kzalloc for aead_request allocation Use kzalloc for aead_request allocation as we don't set all the bits in the request. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7d80040a37b6..f00383a37622 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -219,7 +219,7 @@ static int tls_do_encryption(struct tls_context *tls_ctx, struct aead_request *aead_req; int rc; - aead_req = kmalloc(req_size, flags); + aead_req = kzalloc(req_size, flags); if (!aead_req) return -ENOMEM; -- cgit v1.2.3 From 6d88207fcfddc002afe3e2e4a455e5201089d5d9 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:45 +0200 Subject: tls: Add function to update the TLS socket configuration The tx configuration is now stored in ctx->tx_conf. And sk->sk_prot is updated trough a function This will simplify things when we add rx and support for different possible tx and rx cross configurations. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 60aff60e30ad..de6a1416bc41 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,8 +45,18 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); -static struct proto tls_base_prot; -static struct proto tls_sw_prot; +enum { + TLS_BASE_TX, + TLS_SW_TX, + TLS_NUM_CONFIG, +}; + +static struct proto tls_prots[TLS_NUM_CONFIG]; + +static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +{ + sk->sk_prot = &tls_prots[ctx->tx_conf]; +} int wait_on_pending_writer(struct sock *sk, long *timeo) { @@ -340,8 +350,8 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, { struct tls_crypto_info *crypto_info, tmp_crypto_info; struct tls_context *ctx = tls_get_ctx(sk); - struct proto *prot = NULL; int rc = 0; + int tx_conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; @@ -396,11 +406,12 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - prot = &tls_sw_prot; + tx_conf = TLS_SW_TX; if (rc) goto err_crypto_info; - sk->sk_prot = prot; + ctx->tx_conf = tx_conf; + update_sk_prot(sk, ctx); goto out; err_crypto_info: @@ -453,7 +464,9 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; - sk->sk_prot = &tls_base_prot; + + ctx->tx_conf = TLS_BASE_TX; + update_sk_prot(sk, ctx); out: return rc; } @@ -464,16 +477,21 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE_TX] = *base; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + + prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX].close = tls_sk_proto_close; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; +} + static int __init tls_register(void) { - tls_base_prot = tcp_prot; - tls_base_prot.setsockopt = tls_setsockopt; - tls_base_prot.getsockopt = tls_getsockopt; - - tls_sw_prot = tls_base_prot; - tls_sw_prot.sendmsg = tls_sw_sendmsg; - tls_sw_prot.sendpage = tls_sw_sendpage; - tls_sw_prot.close = tls_sk_proto_close; + build_protos(tls_prots, &tcp_prot); tcp_register_ulp(&tcp_tls_ulp_ops); -- cgit v1.2.3 From ff45d820a2df163957ad8ab459b6eb6976144c18 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:46 +0200 Subject: tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used. Previously the TLS ulp context would leak if we attached a TLS ulp to a socket but did not use the TLS_TX setsockopt, or did use it but it failed. This patch solves the issue by overriding prot[TLS_BASE_TX].close and fixing tls_sk_proto_close to work properly when its called with ctx->tx_conf == TLS_BASE_TX. This patch also removes ctx->free_resources as we can use ctx->tx_conf to obtain the relevant information. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 22 ++++++++++++++-------- net/tls/tls_sw.c | 4 ++-- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index de6a1416bc41..13427ee7c582 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -226,6 +226,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); lock_sock(sk); + sk_proto_close = ctx->sk_proto_close; + + if (ctx->tx_conf == TLS_BASE_TX) { + kfree(ctx); + goto skip_tx_cleanup; + } if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); @@ -242,13 +248,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) sg++; } } - ctx->free_resources(sk); + kfree(ctx->rec_seq); kfree(ctx->iv); - sk_proto_close = ctx->sk_proto_close; - kfree(ctx); + if (ctx->tx_conf == TLS_SW_TX) + tls_sw_free_tx_resources(sk); +skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); } @@ -402,8 +409,6 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; - ctx->sk_proto_close = sk->sk_prot->close; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); tx_conf = TLS_SW_TX; @@ -464,6 +469,7 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; + ctx->sk_proto_close = sk->sk_prot->close; ctx->tx_conf = TLS_BASE_TX; update_sk_prot(sk, ctx); @@ -480,11 +486,11 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static void build_protos(struct proto *prot, struct proto *base) { prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].close = tls_sk_proto_close; prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].close = tls_sk_proto_close; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f00383a37622..fcd92a9c2d06 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -639,7 +639,7 @@ sendpage_end: return ret; } -static void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_tx_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); @@ -650,6 +650,7 @@ static void tls_sw_free_resources(struct sock *sk) tls_free_both_sg(sk); kfree(ctx); + kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) @@ -679,7 +680,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - ctx->free_resources = tls_sw_free_resources; crypto_info = &ctx->crypto_send; switch (crypto_info->cipher_type) { -- cgit v1.2.3 From 213ef6e7c9c063c482d77f12cc438872628d48ec Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:47 +0200 Subject: tls: Move tls_make_aad to header to allow sharing move tls_make_aad as it is going to be reused by the device offload code and rx path. Remove unused recv parameter. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fcd92a9c2d06..73d19210dd49 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -39,22 +39,6 @@ #include -static inline void tls_make_aad(int recv, - char *buf, - size_t size, - char *record_sequence, - int record_sequence_size, - unsigned char record_type) -{ - memcpy(buf, record_sequence, record_sequence_size); - - buf[8] = record_type; - buf[9] = TLS_1_2_VERSION_MAJOR; - buf[10] = TLS_1_2_VERSION_MINOR; - buf[11] = size >> 8; - buf[12] = size & 0xFF; -} - static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -249,7 +233,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); - tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size, + tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, tls_ctx->rec_seq, tls_ctx->rec_seq_size, record_type); -- cgit v1.2.3 From 196c31b4b54474b31dee3c30352c45c2a93e9226 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:48 +0200 Subject: tls: Avoid copying crypto_info again after cipher_type check. Avoid copying crypto_info again after cipher_type check to avoid a TOCTOU exploits. The temporary array on the stack is removed as we don't really need it Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 13427ee7c582..ab1bd167b63b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -355,7 +355,7 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, unsigned int optlen) { - struct tls_crypto_info *crypto_info, tmp_crypto_info; + struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); int rc = 0; int tx_conf; @@ -365,36 +365,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); + crypto_info = &ctx->crypto_send; + /* Currently we don't support set crypto info more than one time */ + if (TLS_CRYPTO_INFO_READY(crypto_info)) + goto out; + + rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto out; } /* check version */ - if (tmp_crypto_info.version != TLS_1_2_VERSION) { + if (crypto_info->version != TLS_1_2_VERSION) { rc = -ENOTSUPP; - goto out; + goto err_crypto_info; } - /* get user crypto info */ - crypto_info = &ctx->crypto_send; - - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - goto out; - - switch (tmp_crypto_info.cipher_type) { + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; goto out; } - rc = copy_from_user( - crypto_info, - optval, - sizeof(struct tls12_crypto_info_aes_gcm_128)); - + rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; -- cgit v1.2.3 From ee181e5201e640a4b92b217e9eab2531dab57d2c Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:49 +0200 Subject: tls: don't override sk_write_space if tls_set_sw_offload fails. If we fail to enable tls in the kernel we shouldn't override the sk_write_space callback Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ab1bd167b63b..e07ee3ae0023 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -401,9 +401,6 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); tx_conf = TLS_SW_TX; @@ -412,6 +409,8 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, ctx->tx_conf = tx_conf; update_sk_prot(sk, ctx); + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; goto out; err_crypto_info: -- cgit v1.2.3 From 51f299dd94bb1e28d03eefbc4fe0b9282f9ee2fa Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:04 +0100 Subject: net: core: improve sanity checking in __dev_alloc_name __dev_alloc_name is called from the public (and exported) dev_alloc_name(), so we don't have a guarantee that strlen(name) is at most IFNAMSIZ. If somebody manages to get __dev_alloc_name called with a % char beyond the 31st character, we'd be making a snprintf() call that will very easily crash the kernel (using an appropriate %p extension, we'll likely dereference some completely bogus pointer). In the normal case where strlen() is sane, we don't even save anything by limiting to IFNAMSIZ, so just use strchr(). Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 658337bf33e4..1a5d31fdea27 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1064,7 +1064,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) unsigned long *inuse; struct net_device *d; - p = strnchr(name, IFNAMSIZ-1, '%'); + p = strchr(name, '%'); if (p) { /* * Verify the string as this thing may have come from -- cgit v1.2.3 From 2c88b855981481970b731bf3f4508400aac429fb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:05 +0100 Subject: net: core: move dev_alloc_name_ns a little higher No functional change. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1a5d31fdea27..4545685d2fa7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1107,6 +1107,19 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) return -ENFILE; } +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) +{ + char buf[IFNAMSIZ]; + int ret; + + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + /** * dev_alloc_name - allocate a name for a device * @dev: device @@ -1136,19 +1149,6 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_alloc_name_ns(struct net *net, - struct net_device *dev, - const char *name) -{ - char buf[IFNAMSIZ]; - int ret; - - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; -} - int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { -- cgit v1.2.3 From c46d7642e915106b0301bc4d53a79e8e806c2eb9 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:06 +0100 Subject: net: core: eliminate dev_alloc_name{,_ns} code duplication dev_alloc_name contained a BUG_ON(), which I moved to dev_alloc_name_ns; the only other caller of that already has the same BUG_ON. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4545685d2fa7..7580c2046c95 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,6 +1114,7 @@ static int dev_alloc_name_ns(struct net *net, char buf[IFNAMSIZ]; int ret; + BUG_ON(!net); ret = __dev_alloc_name(net, name, buf); if (ret >= 0) strlcpy(dev->name, buf, IFNAMSIZ); @@ -1136,16 +1137,7 @@ static int dev_alloc_name_ns(struct net *net, int dev_alloc_name(struct net_device *dev, const char *name) { - char buf[IFNAMSIZ]; - struct net *net; - int ret; - - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; + return dev_alloc_name_ns(dev_net(dev), dev, name); } EXPORT_SYMBOL(dev_alloc_name); -- cgit v1.2.3 From 6224abda0db8845756571833744d4414f144ecb5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:07 +0100 Subject: net: core: drop pointless check in __dev_alloc_name The only caller passes a stack buffer as buf, so it won't equal the passed-in name. Moreover, we're already using buf as a scratch buffer inside the if (p) {} block, so if buf and name were the same, that snprintf() call would be overwriting its own format string. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7580c2046c95..4cedc7595f1f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1095,8 +1095,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - if (buf != name) - snprintf(buf, IFNAMSIZ, name, i); + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; -- cgit v1.2.3 From 93809105cf9d43790839d8b8e29a8a505290ec68 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:08 +0100 Subject: net: core: check dev_valid_name in __dev_alloc_name We currently only exclude non-sysfs-friendly names via dev_get_valid_name; there doesn't seem to be a reason to allow such names when we're called via dev_alloc_name. This does duplicate the dev_valid_name check in the dev_get_valid_name() case; we'll fix that shortly. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4cedc7595f1f..cb3d95edf58d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1064,6 +1064,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) unsigned long *inuse; struct net_device *d; + if (!dev_valid_name(name)) + return -EINVAL; + p = strchr(name, '%'); if (p) { /* -- cgit v1.2.3 From d6f295e9def0bee85b37bdffb95153721935c342 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:09 +0100 Subject: net: core: maybe return -EEXIST in __dev_alloc_name If we're given format string with no %d, -EEXIST is a saner error code. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index cb3d95edf58d..1bb856eaed1c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,7 +1106,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ - return -ENFILE; + return p ? -ENFILE : -EEXIST; } static int dev_alloc_name_ns(struct net *net, -- cgit v1.2.3 From 87c320e51519a83c496ab7bfb4e96c8f9c001e89 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Nov 2017 00:15:10 +0100 Subject: net: core: dev_get_valid_name is now the same as dev_alloc_name_ns If name contains a %, it's easy to see that this patch doesn't change anything (other than eliminate the duplicate dev_valid_name call). Otherwise, we'll now just spend a little time in snprintf() copying name to the stack buffer allocated in dev_alloc_name_ns, and do the __dev_get_by_name using that buffer rather than name. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- net/core/dev.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1bb856eaed1c..ad5f90dacd92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1146,19 +1146,7 @@ EXPORT_SYMBOL(dev_alloc_name); int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - BUG_ON(!net); - - if (!dev_valid_name(name)) - return -EINVAL; - - if (strchr(name, '%')) - return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) - return -EEXIST; - else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); - - return 0; + return dev_alloc_name_ns(net, dev, name); } EXPORT_SYMBOL(dev_get_valid_name); -- cgit v1.2.3 From 1a48fbd9ec1483f5bf3da63dfd907f4272828b4a Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Mon, 13 Nov 2017 14:25:25 +0100 Subject: net: dsa: lan9303: calculate offload_fwd_mark from tag The lan9303 set bits in the host CPU tag indicating if a ingress frame is a trapped IGMP or STP frame. Use these bits to calculate skb->offload_fwd_mark more efficiently. Signed-off-by: Egil Hjelmeland Signed-off-by: David S. Miller --- net/dsa/tag_lan9303.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index b8c5e52b2eff..548c00254c07 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,6 +42,10 @@ #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) +# define LAN9303_TAG_RX_IGMP BIT(3) +# define LAN9303_TAG_RX_STP BIT(4) +# define LAN9303_TAG_RX_TRAPPED_TO_CPU (LAN9303_TAG_RX_IGMP | \ + LAN9303_TAG_RX_STP) /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. @@ -91,9 +95,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u16 *lan9303_tag; + u16 lan9303_tag1; unsigned int source_port; - u16 ether_type_nw; - u8 ip_protocol; if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, @@ -114,7 +117,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; } - source_port = ntohs(lan9303_tag[1]) & 0x3; + lan9303_tag1 = ntohs(lan9303_tag[1]); + source_port = lan9303_tag1 & 0x3; skb->dev = dsa_master_find_slave(dev, 0, source_port); if (!skb->dev) { @@ -128,19 +132,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull_rcsum(skb, 2 + 2); memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - skb->offload_fwd_mark = !ether_addr_equal(skb->data - ETH_HLEN, - eth_stp_addr); - - /* We also need IGMP packets to have skb->offload_fwd_mark = 0. - * Solving this for all conceivable situations would add more cost to - * every packet. Instead we handle just the common case: - * No VLAN tag + Ethernet II framing. - * Test least probable term first. - */ - ether_type_nw = lan9303_tag[2]; - ip_protocol = *(skb->data + 9); - if (ip_protocol == IPPROTO_IGMP && ether_type_nw == htons(ETH_P_IP)) - skb->offload_fwd_mark = 0; + skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU); return skb; } -- cgit v1.2.3 From 0c4b9169781cf15c4c1b9171dab98ff96d5c7fd7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Nov 2017 15:57:30 +0100 Subject: netlink: remove unnecessary forward declaration netlink_skb_destructor() is actually defined before the first usage in the file, so remove the unnecessary forward declaration. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a3eab903a9cd..b9e0ee4e22f5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -128,7 +128,6 @@ static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { }; static int netlink_dump(struct sock *sk); -static void netlink_skb_destructor(struct sk_buff *skb); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion -- cgit v1.2.3 From fbec443bfe44f58a40e00962e969b5a9cafde457 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 Nov 2017 18:30:55 +0200 Subject: net: bridge: add vlan_tunnel to bridge port policies Found another missing port flag policy entry for IFLA_BRPORT_VLAN_TUNNEL so add it now. CC: Roopa Prabhu Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 67bae0f11c67..d0ef0a8e8831 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -657,6 +657,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, }; -- cgit v1.2.3 From 8a860c2bcc84a8e4fbcabb928cd97e4c51b17d93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 14 Nov 2017 06:20:16 +0000 Subject: openvswitch: Fix return value check in ovs_meter_cmd_features() In case of error, the function ovs_meter_cmd_reply_start() returns ERR_PTR() not NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") Signed-off-by: Wei Yongjun Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 2a5ba356c472..2e58b6c4c65f 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -166,7 +166,7 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); - if (!reply) + if (IS_ERR(reply)) return PTR_ERR(reply); if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || -- cgit v1.2.3 From 06c2351fdebb38803f10ace19ed8daf9b9c91e12 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 14 Nov 2017 06:27:03 +0000 Subject: openvswitch: Make local function ovs_nsh_key_attr_size() static Fixes the following sparse warnings: net/openvswitch/flow_netlink.c:340:8: warning: symbol 'ovs_nsh_key_attr_size' was not declared. Should it be static? Signed-off-by: Wei Yongjun Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index bb4dae198c78..dc424798ba6f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -338,7 +338,7 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } -size_t ovs_nsh_key_attr_size(void) +static size_t ovs_nsh_key_attr_size(void) { /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider * updating this function. -- cgit v1.2.3 From 6dc14dc40a1d1dafd8491c349b5f3e15aabc4edb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 14 Nov 2017 06:27:12 +0000 Subject: openvswitch: Using kfree_rcu() to simplify the code The callback function of call_rcu() just calls a kfree(), so we can use kfree_rcu() instead of call_rcu() + callback function. Signed-off-by: Wei Yongjun Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 2e58b6c4c65f..52ddd6c408b3 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -42,19 +42,12 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; -static void rcu_free_ovs_meter_callback(struct rcu_head *rcu) -{ - struct dp_meter *meter = container_of(rcu, struct dp_meter, rcu); - - kfree(meter); -} - static void ovs_meter_free(struct dp_meter *meter) { if (!meter) return; - call_rcu(&meter->rcu, rcu_free_ovs_meter_callback); + kfree_rcu(meter, rcu); } static struct hlist_head *meter_hash_bucket(const struct datapath *dp, -- cgit v1.2.3 From c92eb77aff6a11c79059e2caffdd3baede218a9e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 13 Nov 2017 23:21:36 -0800 Subject: net-sysfs: trigger netlink notification on ifalias change via sysfs This patch adds netlink notifications on iflias changes via sysfs. makes it consistent with the netlink path which also calls netdev_state_change. Also makes it consistent with other sysfs netdev_store operations. Signed-off-by: Roopa Prabhu Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 51d5836d8fb9..799b75268291 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,7 +382,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret; + ssize_t ret = 0; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -391,9 +391,20 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - ret = dev_set_alias(netdev, buf, count); + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); + } +err: + rtnl_unlock(); - return ret < 0 ? ret : len; + return ret; } static ssize_t ifalias_show(struct device *dev, -- cgit v1.2.3 From 094009531612246d9e13f9e0c3ae2205d7f63a0a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 14 Nov 2017 14:21:32 +0100 Subject: ipv6: set all.accept_dad to 0 by default With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag is also taken into account (default value is 1). If either global or per-interface flag is non-zero, DAD will be enabled on a given interface. This is not backward compatible: before those patches, the user could disable DAD just by setting the per-interface flag to 0. Now, the user instead needs to set both flags to 0 to actually disable DAD. Restore the previous behaviour by setting the default for the global 'accept_dad' flag to 0. This way, DAD is still enabled by default, as per-interface flags are set to 1 on device creation, but setting them to 0 is enough to disable DAD on a given interface. - Before 35e015e1f57a7 and a2d3f3e33853: global per-interface DAD enabled [default] 1 1 yes X 0 no X 1 yes - After 35e015e1f577 and a2d3f3e33853: global per-interface DAD enabled [default] 1 1 yes 0 0 no 0 1 yes 1 0 yes - After this fix: global per-interface DAD enabled 1 1 yes 0 0 no [default] 0 1 yes 1 0 yes Fixes: 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") Fixes: a2d3f3e33853 ("ipv6: fix net.ipv6.conf.all.accept_dad behaviour for real") CC: Stefano Brivio CC: Matteo Croce CC: Erik Kline Signed-off-by: Nicolas Dichtel Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a6dffd65eb9d..a0ae1c9d37df 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, - .accept_dad = 1, + .accept_dad = 0, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, .stable_secret = { -- cgit v1.2.3 From 11bf284f81b46f59d5f4a4522c13aa7852cfd560 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 14 Nov 2017 16:51:56 +0300 Subject: net: Protect iterations over net::fib_notifier_ops in fib_seq_sum() There is at least unlocked deletion of net->ipv4.fib_notifier_ops from net::fib_notifier_ops: ip_fib_net_exit() rtnl_unlock() fib4_notifier_exit() fib_notifier_ops_unregister(net->ipv4.notifier_ops) list_del_rcu(&ops->list) So fib_seq_sum() can't use rtnl_lock() only for protection. The possible solution could be to use rtnl_lock() in fib_notifier_ops_unregister(), but this adds a possible delay during net namespace creation, so we better use rcu_read_lock() till someone really needs the mutex (if that happens). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/fib_notifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 6b8cd494574f..0c048bdeb016 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -34,12 +34,14 @@ static unsigned int fib_seq_sum(void) rtnl_lock(); for_each_net(net) { - list_for_each_entry(ops, &net->fib_notifier_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; fib_seq += ops->fib_seq_read(net); module_put(ops->owner); } + rcu_read_unlock(); } rtnl_unlock(); -- cgit v1.2.3 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 +-- net/ipv4/sysctl_net_ipv4.c | 19 +++++++----- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++++++------------------------ net/ipv4/tcp_ipv4.c | 9 ++++++ net/ipv6/route.c | 3 +- 5 files changed, 60 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef0ff3357a44..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one }, #endif + { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eac84b8044e..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 70d9659fc1e9..05eb7bc36156 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2378,6 +2378,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { -- cgit v1.2.3 From bce552fd6f6e37f9567c85c4f0d6d1987eef379f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 11:27:01 -0800 Subject: netem: use 64 bit divide by rate Since times are now expressed in nanosecond, need to now do true 64 bit divide. Old code would truncate rate at 32 bits. Rename function to better express current usage. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b686e755fda9..644323d6081c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -339,10 +339,8 @@ static s64 tabledist(s64 mu, s64 sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -static u64 packet_len_2_sched_time(unsigned int len, - struct netem_sched_data *q) +static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { - u64 offset; len += q->packet_overhead; if (q->cell_size) { @@ -352,9 +350,8 @@ static u64 packet_len_2_sched_time(unsigned int len, cells++; len = cells * (q->cell_size + q->cell_overhead); } - offset = (u64)len * NSEC_PER_SEC; - do_div(offset, q->rate); - return offset; + + return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) @@ -556,7 +553,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, now = last->time_to_send; } - delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); + delay += packet_time_ns(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; -- cgit v1.2.3 From 9b0ed89172efec1d9f214d173ad6046f10f6b742 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 11:27:02 -0800 Subject: netem: remove unnecessary 64 bit modulus Fix compilation on 32 bit platforms (where doing modulus operation with 64 bit requires extra glibc functions) by truncation. The jitter for table distribution is limited to a 32 bit value because random numbers are scaled as 32 bit value. Also fix some whitespace. Fixes: 99803171ef04 ("netem: add uapi to express delay and jitter in nanoseconds") Reported-by: Randy Dunlap Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 644323d6081c..dd70924cbcdf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -312,9 +312,9 @@ static bool loss_event(struct netem_sched_data *q) * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ -static s64 tabledist(s64 mu, s64 sigma, +static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, - const struct disttable *dist) + const struct disttable *dist) { s64 x; long t; @@ -327,7 +327,7 @@ static s64 tabledist(s64 mu, s64 sigma, /* default uniform distribution */ if (dist == NULL) - return (rnd % (2*sigma)) - sigma + mu; + return (rnd % (2 * sigma)) - sigma + mu; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; -- cgit v1.2.3 From b74912a2fdae9aadd20da502644aa8848c861954 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 14 Nov 2017 14:26:16 -0600 Subject: openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start It seems that the intention of the code is to null check the value returned by function genlmsg_put. But the current code is null checking the address of the pointer that holds the value returned by genlmsg_put. Fix this by properly null checking the value returned by function genlmsg_put in order to avoid a pontential null pointer dereference. Addresses-Coverity-ID: 1461561 ("Dereference before null check") Addresses-Coverity-ID: 1461562 ("Dereference null return value") Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 52ddd6c408b3..3fbfc78991ac 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -99,7 +99,7 @@ ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, *ovs_reply_header = genlmsg_put(skb, info->snd_portid, info->snd_seq, &dp_meter_genl_family, 0, cmd); - if (!ovs_reply_header) { + if (!*ovs_reply_header) { nlmsg_free(skb); return ERR_PTR(-EMSGSIZE); } -- cgit v1.2.3 From 94802151894d482e82c324edf2c658f8e6b96508 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 15 Nov 2017 06:40:57 +0100 Subject: Revert "xfrm: Fix stack-out-of-bounds read in xfrm_state_find." This reverts commit c9f3f813d462c72dbe412cee6a5cbacf13c4ad5e. This commit breaks transport mode when the policy template has widlcard addresses configured, so revert it. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2a6093840e7e..6bc16bb61b55 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1362,29 +1362,36 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct net *net = xp_net(policy); int nx; int i, error; + xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); + xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; - xfrm_address_t *local; - xfrm_address_t *remote; + xfrm_address_t *remote = daddr; + xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; - remote = &tmpl->id.daddr; - local = &tmpl->saddr; - if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, fl->flowi_oif, - &tmp, remote, - tmpl->encap_family, 0); - if (error) - goto fail; - local = &tmp; + if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_BEET) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; + if (xfrm_addr_any(local, tmpl->encap_family)) { + error = xfrm_get_saddr(net, fl->flowi_oif, + &tmp, remote, + tmpl->encap_family, 0); + if (error) + goto fail; + local = &tmp; + } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; + daddr = remote; + saddr = local; continue; } if (x) { -- cgit v1.2.3 From 50895b9de1d3e0258e015e8e55128d835d9a9f19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 21:02:19 -0800 Subject: tcp: highest_sack fix syzbot easily found a regression added in our latest patches [1] No longer set tp->highest_sack to the head of the send queue since this is not logical and error prone. Only sack processing should maintain the pointer to an skb from rtx queue. We might in the future only remember the sequence instead of a pointer to skb, since rb-tree should allow a fast lookup. [1] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1706 [inline] BUG: KASAN: use-after-free in tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 Read of size 4 at addr ffff8801c154faa8 by task syz-executor4/12860 CPU: 0 PID: 12860 Comm: syz-executor4 Not tainted 4.14.0-next-20171113+ #41 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 tcp_highest_sack_seq include/net/tcp.h:1706 [inline] tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x452879 RSP: 002b:00007fc9761bfbe8 EFLAGS: 00000212 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452879 RDX: 0000000000000000 RSI: 0000000020917fc8 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ee3a0 R13: 00000000ffffffff R14: 00007fc9761c06d4 R15: 0000000000000000 Allocated by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x144/0x760 mm/slab.c:3638 __alloc_skb+0xf1/0x780 net/core/skbuff.c:193 alloc_skb_fclone include/linux/skbuff.h:1023 [inline] sk_stream_alloc_skb+0x11d/0x900 net/ipv4/tcp.c:870 tcp_sendmsg_locked+0x1341/0x3b80 net/ipv4/tcp.c:1299 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 SYSC_sendto+0x358/0x5a0 net/socket.c:1749 SyS_sendto+0x40/0x50 net/socket.c:1717 entry_SYSCALL_64_fastpath+0x1f/0x96 Freed by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3492 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3750 kfree_skbmem+0xdd/0x1d0 net/core/skbuff.c:603 __kfree_skb+0x1d/0x20 net/core/skbuff.c:642 sk_wmem_free_skb include/net/sock.h:1419 [inline] tcp_rtx_queue_unlink_and_free include/net/tcp.h:1682 [inline] tcp_clean_rtx_queue net/ipv4/tcp_input.c:3111 [inline] tcp_ack+0x1b17/0x4fd0 net/ipv4/tcp_input.c:3593 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 The buggy address belongs to the object at ffff8801c154fa80 which belongs to the cache skbuff_fclone_cache of size 456 The buggy address is located 40 bytes inside of 456-byte region [ffff8801c154fa80, ffff8801c154fc48) The buggy address belongs to the page: page:ffffea00070553c0 count:1 mapcount:0 mapping:ffff8801c154f080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801c154f080 0000000000000000 0000000100000006 raw: ffffea00070a5a20 ffffea0006a18360 ffff8801d9ca0500 0000000000000000 page dumped because: kasan: bad access detected Fixes: 737ff314563c ("tcp: use sequence distance to detect reordering") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c3447c5512fd..f0b572fe959a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3534,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fack = tcp_highest_sack_seq(tp); + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet -- cgit v1.2.3 From ca3af4dd28cff4e7216e213ba3b671fbf9f84758 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 15 Nov 2017 16:55:54 +0800 Subject: sctp: do not free asoc when it is already dead in sctp_sendmsg Now in sctp_sendmsg sctp_wait_for_sndbuf could schedule out without holding sock sk. It means the current asoc can be freed elsewhere, like when receiving an abort packet. If the asoc is just created in sctp_sendmsg and sctp_wait_for_sndbuf returns err, the asoc will be freed again due to new_asoc is not nil. An use-after-free issue would be triggered by this. This patch is to fix it by setting new_asoc with nil if the asoc is already dead when cpu schedules back, so that it will not be freed again in sctp_sendmsg. v1->v2: set new_asoc as nil in sctp_sendmsg instead of sctp_wait_for_sndbuf. Suggested-by: Neil Horman Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b029757bea03..fec8de73a06f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1971,8 +1971,14 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) + if (err) { + if (err == -ESRCH) { + /* asoc is already dead. */ + new_asoc = NULL; + err = -EPIPE; + } goto out_free; + } } /* If an address is passed with the sendto/sendmsg call, it is used @@ -8006,10 +8012,11 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); + if (asoc->base.dead) + goto do_dead; if (!*timeo_p) goto do_nonblock; - if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || - asoc->base.dead) + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; if (signal_pending(current)) goto do_interrupted; @@ -8034,6 +8041,10 @@ out: return err; +do_dead: + err = -ESRCH; + goto out; + do_error: err = -EPIPE; goto out; -- cgit v1.2.3 From cea0cc80a6777beb6eb643d4ad53690e1ad1d4ff Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 15 Nov 2017 16:57:26 +0800 Subject: sctp: use the right sk after waking up from wait_buf sleep Commit dfcb9f4f99f1 ("sctp: deny peeloff operation on asocs with threads sleeping on it") fixed the race between peeloff and wait sndbuf by checking waitqueue_active(&asoc->wait) in sctp_do_peeloff(). But it actually doesn't work, as even if waitqueue_active returns false the waiting sndbuf thread may still not yet hold sk lock. After asoc is peeled off, sk is not asoc->base.sk any more, then to hold the old sk lock couldn't make assoc safe to access. This patch is to fix this by changing to hold the new sk lock if sk is not asoc->base.sk, meanwhile, also set the sk in sctp_sendmsg with the new sk. With this fix, there is no more race between peeloff and waitbuf, the check 'waitqueue_active' in sctp_do_peeloff can be removed. Thanks Marcelo and Neil for making this clear. v1->v2: fix it by changing to lock the new sock instead of adding a flag in asoc. Suggested-by: Neil Horman Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fec8de73a06f..4c0a77292792 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -84,8 +84,8 @@ /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, + size_t msg_len, struct sock **orig_sk); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1970,7 +1970,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + /* sk can be changed by peel off when waiting for buf. */ + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -5021,12 +5022,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) if (!asoc) return -EINVAL; - /* If there is a thread waiting on more sndbuf space for - * sending on this asoc, it cannot be peeled. - */ - if (waitqueue_active(&asoc->wait)) - return -EBUSY; - /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ @@ -7995,7 +7990,7 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) + size_t msg_len, struct sock **orig_sk) { struct sock *sk = asoc->base.sk; int err = 0; @@ -8029,11 +8024,17 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); + if (sk != asoc->base.sk) { + release_sock(sk); + sk = asoc->base.sk; + lock_sock(sk); + } *timeo_p = current_timeo; } out: + *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ -- cgit v1.2.3 From 423852f89034492cc40920c00908f6de7d2dbe4f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 15 Nov 2017 17:00:11 +0800 Subject: sctp: check stream reset info len before making reconf chunk Now when resetting stream, if both in and out flags are set, the info len can reach: sizeof(struct sctp_strreset_outreq) + SCTP_MAX_STREAM(65535) + sizeof(struct sctp_strreset_inreq) + SCTP_MAX_STREAM(65535) even without duplicated stream no, this value is far greater than the chunk's max size. _sctp_make_chunk doesn't do any check for this, which would cause the skb it allocs is huge, syzbot even reported a crash due to this. This patch is to check stream reset info len before making reconf chunk and return EINVAL if the len exceeds chunk's capacity. Thanks Marcelo and Neil for making this clear. v1->v2: - move the check into sctp_send_reset_streams instead. Fixes: cc16f00f6529 ("sctp: add support for generating stream reconf ssn reset request chunk") Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 2 +- net/sctp/stream.c | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 514465b03829..9bf575f2e8ed 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3594,8 +3594,8 @@ struct sctp_chunk *sctp_make_strreset_req( __u16 stream_num, __be16 *stream_list, bool out, bool in) { + __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; - __u16 stream_len = stream_num * 2; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index b8c8cabb1a58..a11db21dc8a0 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -282,15 +282,31 @@ int sctp_send_reset_streams(struct sctp_association *asoc, str_nums = params->srs_number_streams; str_list = params->srs_stream_list; - if (out && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->outcnt) - goto out; + if (str_nums) { + int param_len = 0; - if (in && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->incnt) - goto out; + if (out) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->outcnt) + goto out; + + param_len = str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_outreq); + } + + if (in) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->incnt) + goto out; + + param_len += str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_inreq); + } + + if (param_len > SCTP_MAX_CHUNK_LEN - + sizeof(struct sctp_reconf_chunk)) + goto out; + } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { -- cgit v1.2.3 From 0a833c29d89656025443cb9f0ebff7052dd95ce0 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 15 Nov 2017 13:09:32 +0100 Subject: genetlink: fix genlmsg_nlhdr() According to the description, first argument of genlmsg_nlhdr() points to what genlmsg_put() returns, i.e. beginning of user header. Therefore we should only subtract size of genetlink header and netlink message header, not user header. This also means we don't need to pass the pointer to genetlink family and the same is true for genl_dump_check_consistent() which is the only caller of genlmsg_nlhdr(). (Note that at the moment, these functions are only used for families which do not have user header so that they are not affected.) Fixes: 670dc2833d14 ("netlink: advertise incomplete dumps") Signed-off-by: Michal Kubecek Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/nfc/netlink.c | 6 +++--- net/wireless/nl80211.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f6359c277212..c0b83dc9d993 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -75,7 +75,7 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, if (!hdr) return -EMSGSIZE; - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || @@ -603,7 +603,7 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, return -EMSGSIZE; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; @@ -1356,7 +1356,7 @@ static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, goto nla_put_failure; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bb16f1ec766e..a0e1951227fa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6291,7 +6291,7 @@ static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb, if (!hdr) return -1; - genl_dump_check_consistent(cb, hdr, &nl80211_fam); + genl_dump_check_consistent(cb, hdr); if (nl80211_put_regdom(regdom, msg)) goto nla_put_failure; @@ -7722,7 +7722,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, if (!hdr) return -1; - genl_dump_check_consistent(cb, hdr, &nl80211_fam); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation)) goto nla_put_failure; -- cgit v1.2.3 From 8252fceac01570836f0cb9e922f56b4c0b1011df Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Nov 2017 18:28:21 +0100 Subject: netfilter: add ifdef around ctnetlink_proto_size This function is no longer marked 'inline', so we now get a warning when it is unused: net/netfilter/nf_conntrack_netlink.c:536:15: error: 'ctnetlink_proto_size' defined but not used [-Werror=unused-function] We could mark it inline again, mark it __maybe_unused, or add an #ifdef around the definition. I'm picking the third approach here since that seems to be what the rest of the file has. Fixes: 5caaed151a68 ("netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6e0adfefb9ed..59c08997bfdf 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,6 +533,7 @@ nla_put_failure: return -1; } +#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; @@ -552,6 +553,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } +#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { -- cgit v1.2.3 From d618d09a68e4eed7a435beb2e355250f6f40664a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 15 Nov 2017 21:23:56 +0100 Subject: tipc: enforce valid ratio between skb truesize and contents The socket level flow control is based on the assumption that incoming buffers meet the condition (skb->truesize / roundup(skb->len) <= 4), where the latter value is rounded off upwards to the nearest 1k number. This does empirically hold true for the device drivers we know, but we cannot trust that it will always be so, e.g., in a system with jumbo frames and very small packets. We now introduce a check for this condition at packet arrival, and if we find it to be false, we copy the packet to a new, smaller buffer, where the condition will be true. We expect this to affect only a small fraction of all incoming packets, if at all. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 24 +++++++++++++++++------- net/tipc/msg.h | 7 ++++++- net/tipc/node.c | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 1649d456e22d..b0d07b35909d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -174,7 +174,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = false; - if (unlikely(!tipc_msg_validate(head))) + if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; @@ -201,11 +201,21 @@ err: * TIPC will ignore the excess, under the assumption that it is optional info * introduced by a later release of the protocol. */ -bool tipc_msg_validate(struct sk_buff *skb) +bool tipc_msg_validate(struct sk_buff **_skb) { - struct tipc_msg *msg; + struct sk_buff *skb = *_skb; + struct tipc_msg *hdr; int msz, hsz; + /* Ensure that flow control ratio condition is satisfied */ + if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) { + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return false; + kfree_skb(*_skb); + *_skb = skb; + } + if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) @@ -217,11 +227,11 @@ bool tipc_msg_validate(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, hsz))) return false; - msg = buf_msg(skb); - if (unlikely(msg_version(msg) != TIPC_VERSION)) + hdr = buf_msg(skb); + if (unlikely(msg_version(hdr) != TIPC_VERSION)) return false; - msz = msg_size(msg); + msz = msg_size(hdr); if (unlikely(msz < hsz)) return false; if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) @@ -411,7 +421,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) skb_pull(*iskb, offset); imsz = msg_size(buf_msg(*iskb)); skb_trim(*iskb, imsz); - if (unlikely(!tipc_msg_validate(*iskb))) + if (unlikely(!tipc_msg_validate(iskb))) goto none; *pos += align(imsz); return true; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index bf8f57ccc70c..3e4384c222f7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -926,7 +926,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); -bool tipc_msg_validate(struct sk_buff *skb); +bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); @@ -954,6 +954,11 @@ static inline u16 buf_seqno(struct sk_buff *skb) return msg_seqno(buf_msg(skb)); } +static inline int buf_roundup_len(struct sk_buff *skb) +{ + return (skb->len / 1024 + 1) * 1024; +} + /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any diff --git a/net/tipc/node.c b/net/tipc/node.c index 009a81631280..507017fe0f1b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1539,7 +1539,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) __skb_queue_head_init(&xmitq); /* Ensure message is well-formed before touching the header */ - if (unlikely(!tipc_msg_validate(skb))) + if (unlikely(!tipc_msg_validate(&skb))) goto discard; hdr = buf_msg(skb); usr = msg_user(hdr); -- cgit v1.2.3 From c413af877ffa8a8c28796c9d156cf40a83e58994 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:49 -0800 Subject: net/rds/ib_fmr.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-7-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Santosh Shilimkar Cc: "David S. Miller" Cc: Christoph Hellwig Cc: Damien Le Moal Cc: David Rientjes Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Sean Hefty Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/rds/ib_fmr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 86ef907067bb..e0f70c4051b6 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -139,8 +139,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, return -EINVAL; } - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); + dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) { ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -ENOMEM; -- cgit v1.2.3 From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/skbuff.c | 5 ----- net/core/sock.c | 2 -- net/ipv4/inet_timewait_sock.c | 3 --- net/ipv4/tcp_input.c | 1 - net/socket.c | 1 - 5 files changed, 12 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e140ba49b30a..6cd057b41f34 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); @@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); return skb; } @@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; } diff --git a/net/core/sock.c b/net/core/sock.c index 415f441c63b9..78401fa33ce8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..d451b9f19b59 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 887585045b27..c04d60a677a7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6195,7 +6195,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; diff --git a/net/socket.c b/net/socket.c index c729625eb5d3..42d8e9c9ccd5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -568,7 +568,6 @@ struct socket *sock_alloc(void) sock = SOCKET_I(inode); - kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); -- cgit v1.2.3 From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Juding from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no sigificant difference which is not surprising given that the __GFP_COLD branches are a miniscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cd057b41f34..9c68555bb906 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); -- cgit v1.2.3 From 7c8a61d9ee1df0fb4747879fa67a99614eb62fec Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Nov 2017 22:17:48 -0600 Subject: net/sctp: Always set scope_id in sctp_inet6_skb_msgname Alexandar Potapenko while testing the kernel with KMSAN and syzkaller discovered that in some configurations sctp would leak 4 bytes of kernel stack. Working with his reproducer I discovered that those 4 bytes that are leaked is the scope id of an ipv6 address returned by recvmsg. With a little code inspection and a shrewd guess I discovered that sctp_inet6_skb_msgname only initializes the scope_id field for link local ipv6 addresses to the interface index the link local address pertains to instead of initializing the scope_id field for all ipv6 addresses. That is almost reasonable as scope_id's are meaniningful only for link local addresses. Set the scope_id in all other cases to 0 which is not a valid interface index to make it clear there is nothing useful in the scope_id field. There should be no danger of breaking userspace as the stack leak guaranteed that previously meaningless random data was being returned. Fixes: 372f525b495c ("SCTP: Resync with LKSCTP tree.") History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a6dfa86c0201..3b18085e3b10 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -807,9 +807,10 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; - if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); - } + else + addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); -- cgit v1.2.3 From 61433af56077f5fd8815281b44938d84feb04687 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 15:01:06 -0400 Subject: xprtrdma: Throw away reply when version is unrecognized A reply with an unrecognized value in the version field means the transport header is potentially garbled and therefore all the fields are untrustworthy. Fixes: 59aa1f9a3cce3 ("xprtrdma: Properly handle RDMA_ERROR ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f1889f4d4803..20c9e4cbaa73 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1248,6 +1248,9 @@ rpcrdma_reply_handler(struct work_struct *work) p++; /* credits */ proc = *p++; + if (vers != rpcrdma_version) + goto out_badversion; + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) return; @@ -1280,8 +1283,6 @@ rpcrdma_reply_handler(struct work_struct *work) } xprt->reestablish_timeout = 0; - if (vers != rpcrdma_version) - goto out_badversion; switch (proc) { case rdma_msg: @@ -1321,17 +1322,15 @@ out_badstatus: } return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badversion: dprintk("RPC: %s: invalid version %d\n", __func__, be32_to_cpu(vers)); - status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - goto out; + goto repost; +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ out_badheader: dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); -- cgit v1.2.3 From 5381e0ec72eeb9467796ac4181ccb7bbce6d3e81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 15:01:14 -0400 Subject: xprtrdma: Move decoded header fields into rpcrdma_rep Clean up: Make it easier to pass the decoded XID, vers, credits, and proc fields around by moving these variables into struct rpcrdma_rep. Note: the credits field will be handled in a subsequent patch. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 36 +++++++++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 20c9e4cbaa73..e355cd322a32 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -970,14 +970,13 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, - __be32 xid, __be32 proc) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; - if (proc != rdma_msg) + if (rep->rr_proc != rdma_msg) return false; /* Peek at stream contents without advancing. */ @@ -992,7 +991,7 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return false; /* RPC header */ - if (*p++ != xid) + if (*p++ != rep->rr_xid) return false; if (*p != cpu_to_be32(RPC_CALL)) return false; @@ -1224,41 +1223,40 @@ rpcrdma_reply_handler(struct work_struct *work) container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct xdr_stream *xdr = &rep->rr_stream; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *p, xid, vers, proc; unsigned long cwnd; int status; + __be32 *p; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - xdr_init_decode(xdr, &rep->rr_hdrbuf, + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); /* Fixed transport header fields */ - p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); if (unlikely(!p)) goto out_shortreply; - xid = *p++; - vers = *p++; + rep->rr_xid = *p++; + rep->rr_vers = *p++; p++; /* credits */ - proc = *p++; + rep->rr_proc = *p++; - if (vers != rpcrdma_version) + if (rep->rr_vers != rpcrdma_version) goto out_badversion; - if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + if (rpcrdma_is_bcall(r_xprt, rep)) return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ spin_lock(&xprt->recv_lock); - rqst = xprt_lookup_rqst(xprt, xid); + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); @@ -1267,7 +1265,7 @@ rpcrdma_reply_handler(struct work_struct *work) req->rl_reply = rep; dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(xid)); + __func__, rep, req, be32_to_cpu(rep->rr_xid)); /* Invalidate and unmap the data payloads before waking the * waiting application. This guarantees the memory regions @@ -1284,7 +1282,7 @@ rpcrdma_reply_handler(struct work_struct *work) xprt->reestablish_timeout = 0; - switch (proc) { + switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); break; @@ -1324,7 +1322,7 @@ out_badstatus: out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(vers)); + __func__, be32_to_cpu(rep->rr_vers)); goto repost; /* If the incoming reply terminated a pending RPC, the next @@ -1333,7 +1331,7 @@ out_badversion: */ out_badheader: dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); + rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); r_xprt->rx_stats.bad_reply_count++; status = -EIO; goto out; @@ -1345,7 +1343,7 @@ out_badheader: out_norqst: spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(xid)); + __func__, be32_to_cpu(rep->rr_xid)); goto repost; out_shortreply: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 74e017477e72..858b4c52047d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -218,6 +218,9 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + __be32 rr_xid; + __be32 rr_vers; + __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; struct rpcrdma_regbuf *rr_rdmabuf; -- cgit v1.2.3 From e1352c9610e3235f5e1b159038762d0c01c6ef36 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 15:01:22 -0400 Subject: xprtrdma: Refactor rpcrdma_reply_handler some more Clean up: I'd like to be able to invoke the tail of rpcrdma_reply_handler in two different places. Split the tail out into its own helper function. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 105 ++++++++++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++++---- 2 files changed, 69 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e355cd322a32..418bcc6b3e1d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1211,6 +1211,60 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return -EREMOTEIO; } +/* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. + */ +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + unsigned long cwnd; + int status; + + xprt->reestablish_timeout = 0; + + switch (rep->rr_proc) { + case rdma_msg: + status = rpcrdma_decode_msg(r_xprt, rep, rqst); + break; + case rdma_nomsg: + status = rpcrdma_decode_nomsg(r_xprt, rep); + break; + case rdma_error: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; + default: + status = -EIO; + } + if (status < 0) + goto out_badheader; + +out: + spin_lock(&xprt->recv_lock); + cwnd = xprt->cwnd; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + xprt_complete_rqst(rqst->rq_task, status); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + return; + +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + r_xprt->rx_stats.bad_reply_count++; + status = -EIO; + goto out; +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or @@ -1225,8 +1279,6 @@ rpcrdma_reply_handler(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_req *req; struct rpc_rqst *rqst; - unsigned long cwnd; - int status; __be32 *p; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); @@ -1263,6 +1315,7 @@ rpcrdma_reply_handler(struct work_struct *work) spin_unlock(&xprt->recv_lock); req = rpcr_to_rdmar(rqst); req->rl_reply = rep; + rep->rr_rqst = rqst; dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); @@ -1280,36 +1333,7 @@ rpcrdma_reply_handler(struct work_struct *work) &req->rl_registered); } - xprt->reestablish_timeout = 0; - - switch (rep->rr_proc) { - case rdma_msg: - status = rpcrdma_decode_msg(r_xprt, rep, rqst); - break; - case rdma_nomsg: - status = rpcrdma_decode_nomsg(r_xprt, rep); - break; - case rdma_error: - status = rpcrdma_decode_error(r_xprt, rep, rqst); - break; - default: - status = -EIO; - } - if (status < 0) - goto out_badheader; - -out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - - xprt_complete_rqst(rqst->rq_task, status); - xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", - __func__, xprt, rqst, status); + rpcrdma_complete_rqst(rep); return; out_badstatus: @@ -1325,20 +1349,8 @@ out_badversion: __func__, be32_to_cpu(rep->rr_vers)); goto repost; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ -out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); - r_xprt->rx_stats.bad_reply_count++; - status = -EIO; - goto out; - -/* The req was still available, but by the time the recv_lock - * was acquired, the rqst and task had been released. Thus the RPC - * has already been terminated. +/* The RPC transaction has already been terminated, or the header + * is corrupt. */ out_norqst: spin_unlock(&xprt->recv_lock); @@ -1348,7 +1360,6 @@ out_norqst: out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 858b4c52047d..d68a1351d95e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -202,18 +202,17 @@ enum { }; /* - * struct rpcrdma_rep -- this structure encapsulates state required to recv - * and complete a reply, asychronously. It needs several pieces of - * state: - * o recv buffer (posted to provider) - * o ib_sge (also donated to provider) - * o status of reply (length, success or not) - * o bookkeeping state to get run by reply handler (list, etc) + * struct rpcrdma_rep -- this structure encapsulates state required + * to receive and complete an RPC Reply, asychronously. It needs + * several pieces of state: * - * These are allocated during initialization, per-transport instance. + * o receive buffer and ib_sge (donated to provider) + * o status of receive (success or not, length, inv rkey) + * o bookkeeping state to get run by reply handler (XDR stream) * - * N of these are associated with a transport instance, and stored in - * struct rpcrdma_buffer. N is the max number of outstanding requests. + * These structures are allocated during transport initialization. + * N of these are associated with a transport instance, managed by + * struct rpcrdma_buffer. N is the max number of outstanding RPCs. */ struct rpcrdma_rep { @@ -228,6 +227,7 @@ struct rpcrdma_rep { struct work_struct rr_work; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; + struct rpc_rqst *rr_rqst; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; }; @@ -616,6 +616,7 @@ bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct work_struct *work); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) -- cgit v1.2.3 From d8f532d20ee43a0117284798d486bc4f98e3b196 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 15:01:30 -0400 Subject: xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion I noticed that the soft IRQ thread looked pretty busy under heavy I/O workloads. perf suggested one area that was expensive was the queue_work() call in rpcrdma_wc_receive. That gave me some ideas. Instead of scheduling a separate worker to process RPC Replies, promote the Receive completion handler to IB_POLL_WORKQUEUE, and invoke rpcrdma_reply_handler directly. Note that the poll workqueue is single-threaded. In order to keep memory invalidation from serializing all RPC Replies, handle any necessary invalidation tasks in a separate multi-threaded workqueue. This provides a two-tier scheme, similar to OS I/O interrupt handlers: A fast interrupt handler that schedules the slow handler and re-enables the interrupt, and a slower handler that is invoked for any needed heavy lifting. Benefits include: - One less context switch for RPCs that don't register memory - Receive completion handling is moved out of soft IRQ context to make room for other users of soft IRQ - The same CPU core now DMA syncs and XDR decodes the Receive buffer Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 46 +++++++++++++++++++++++++---------------- net/sunrpc/xprtrdma/verbs.c | 8 +++---- net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++++- 3 files changed, 36 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 418bcc6b3e1d..430f8b5a8c43 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1265,16 +1265,36 @@ out_badheader: goto out; } +/* Reply handling runs in the poll worker thread. Anything that + * might wait is deferred to a separate workqueue. + */ +void rpcrdma_deferred_completion(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); + + rpcrdma_complete_rqst(rep); +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. */ -void -rpcrdma_reply_handler(struct work_struct *work) +void rpcrdma_reply_handler(struct rpcrdma_rep *rep) { - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_req *req; @@ -1320,20 +1340,10 @@ rpcrdma_reply_handler(struct work_struct *work) dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); - /* Invalidate and unmap the data payloads before waking the - * waiting application. This guarantees the memory regions - * are properly fenced from the server before the application - * accesses the data. It also ensures proper send flow control: - * waking the next RPC waits until this RPC has relinquished - * all its Send Queue entries. - */ - if (!list_empty(&req->rl_registered)) { - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - } - - rpcrdma_complete_rqst(rep); + if (list_empty(&req->rl_registered)) + rpcrdma_complete_rqst(rep); + else + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 11a1fbf7e59e..d45695408df3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -73,7 +73,7 @@ static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; +struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -185,7 +185,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rpcrdma_update_granted_credits(rep); out_schedule: - queue_work(rpcrdma_receive_wq, &rep->rr_work); + rpcrdma_reply_handler(rep); return; out_fail: @@ -583,7 +583,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, recvcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_recv_wr + 1, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -974,7 +974,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; - INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); + INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion); rep->rr_recv_wr.next = NULL; rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d68a1351d95e..a85bcd19b37a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -533,6 +533,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *); +extern struct workqueue_struct *rpcrdma_receive_wq; + /* * Endpoint calls - xprtrdma/verbs.c */ @@ -617,7 +619,8 @@ void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); -void rpcrdma_reply_handler(struct work_struct *work); +void rpcrdma_reply_handler(struct rpcrdma_rep *rep); +void rpcrdma_deferred_completion(struct work_struct *work); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) { -- cgit v1.2.3 From be798f9082aa54524b209fac2c8164c81cd28f77 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 16 Oct 2017 15:01:39 -0400 Subject: xprtrdma: Decode credits field in rpcrdma_reply_handler We need to decode and save the incoming rdma_credits field _after_ we know that the direction of the message is "forward direction Reply". Otherwise, the credits value in reverse direction Calls is also used to update the forward direction credits. It is safe to decode the rdma_credits field in rpcrdma_reply_handler now that rpcrdma_reply_handler is single-threaded. Receives complete in the same order as they were sent on the NFS server. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 14 ++++++++++++-- net/sunrpc/xprtrdma/verbs.c | 25 +------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 14 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 430f8b5a8c43..b8818c09a621 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1244,7 +1244,7 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) out: spin_lock(&xprt->recv_lock); cwnd = xprt->cwnd; - xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); @@ -1297,8 +1297,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) { struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; struct rpc_rqst *rqst; + u32 credits; __be32 *p; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); @@ -1315,7 +1317,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) goto out_shortreply; rep->rr_xid = *p++; rep->rr_vers = *p++; - p++; /* credits */ + credits = be32_to_cpu(*p++); rep->rr_proc = *p++; if (rep->rr_vers != rpcrdma_version) @@ -1332,7 +1334,15 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buf->rb_max_requests) + credits = buf->rb_max_requests; + buf->rb_credits = credits; + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; rep->rr_rqst = rqst; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d45695408df3..247b00b715c2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -133,25 +133,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) wc->status, wc->vendor_err); } -/* Perform basic sanity checking to avoid using garbage - * to update the credit grant value. - */ -static void -rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; - __be32 *p = rep->rr_rdmabuf->rg_base; - u32 credits; - - credits = be32_to_cpup(p + 2); - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > buffer->rb_max_requests) - credits = buffer->rb_max_requests; - - atomic_set(&buffer->rb_credits, credits); -} - /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC * @cq: completion queue (ignored) @@ -181,9 +162,6 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); - if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) - rpcrdma_update_granted_credits(rep); - out_schedule: rpcrdma_reply_handler(rep); return; @@ -295,7 +273,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; connected: - atomic_set(&xprt->rx_buf.rb_credits, 1); + xprt->rx_buf.rb_credits = 1; ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); @@ -995,7 +973,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - atomic_set(&buf->rb_credits, 1); spin_lock_init(&buf->rb_mwlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a85bcd19b37a..0e0ae6195a5b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -407,7 +407,7 @@ struct rpcrdma_buffer { struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; - atomic_t rb_credits; /* most recent credit grant */ + u32 rb_credits; /* most recent credit grant */ u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ -- cgit v1.2.3 From ad99f0530710af72b5bbecda9e770c736e92b328 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:47:39 -0400 Subject: xprtrdma: Clean up SGE accounting in rpcrdma_prepare_msg_sges() Clean up. rpcrdma_prepare_hdr_sge() sets num_sge to one, then rpcrdma_prepare_msg_sges() sets num_sge again to the count of SGEs it added, plus one for the header SGE just mapped in rpcrdma_prepare_hdr_sge(). This is confusing, and nails in an assumption about when these functions are called. Instead, maintain a running count that both functions can update with just the number of SGEs they have added to the SGE array. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index b8818c09a621..3c9255824d94 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -637,7 +637,7 @@ map_tail: } out: - req->rl_send_wr.num_sge = sge_no + 1; + req->rl_send_wr.num_sge += sge_no; return true; out_mapping_overflow: -- cgit v1.2.3 From 394b2c77cb761fb1382b0e97b7cdff2dd717b5ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:47:47 -0400 Subject: xprtrdma: Fix error handling in rpcrdma_prepare_msg_sges() When this function fails, it needs to undo the DMA mappings it's done so far. Otherwise these are leaked. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 3c9255824d94..4f6c5395d198 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -511,6 +511,28 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return 0; } +/** + * rpcrdma_unmap_sges - DMA-unmap Send buffers + * @ia: interface adapter (device) + * @req: req with possibly some SGEs to be DMA unmapped + * + */ +void +rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_sge *sge; + unsigned int count; + + /* The first two SGEs contain the transport header and + * the inline buffer. These are always left mapped so + * they can be cheaply re-used. + */ + sge = &req->rl_send_sge[2]; + for (count = req->rl_mapped_sges; count--; sge++) + ib_dma_unmap_page(ia->ri_device, + sge->addr, sge->length, DMA_TO_DEVICE); +} + /* Prepare the RPC-over-RDMA header SGE. */ static bool @@ -641,10 +663,12 @@ out: return true; out_mapping_overflow: + rpcrdma_unmap_sges(ia, req); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: + rpcrdma_unmap_sges(ia, req); pr_err("rpcrdma: Send mapping error\n"); return false; } @@ -671,20 +695,6 @@ out_map: return false; } -void -rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) -{ - struct ib_device *device = ia->ri_device; - struct ib_sge *sge; - int count; - - sge = &req->rl_send_sge[2]; - for (count = req->rl_mapped_sges; count--; sge++) - ib_dma_unmap_page(device, sge->addr, sge->length, - DMA_TO_DEVICE); - req->rl_mapped_sges = 0; -} - /** * rpcrdma_marshal_req - Marshal and send one RPC request * @r_xprt: controlling transport -- cgit v1.2.3 From 857f9acab9343788fe59f7be3a4710131b705db4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:47:55 -0400 Subject: xprtrdma: Change return value of rpcrdma_prepare_send_sges() Clean up: Make rpcrdma_prepare_send_sges() return a negative errno instead of a bool. Soon callers will want distinct treatments of different types of failures. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 +-- net/sunrpc/xprtrdma/rpc_rdma.c | 52 ++++++++++++++++++++++++--------------- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++-- 3 files changed, 38 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d31d0ac5ada9..f0d5998330fe 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -222,8 +222,8 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p++ = xdr_zero; *p = xdr_zero; - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; return 0; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4f6c5395d198..e3ece9843f9d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -544,7 +544,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { if (!__rpcrdma_dma_map_regbuf(ia, rb)) - return false; + goto out_regbuf; sge->addr = rdmab_addr(rb); sge->lkey = rdmab_lkey(rb); } @@ -554,6 +554,10 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge->length, DMA_TO_DEVICE); req->rl_send_wr.num_sge++; return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; } /* Prepare the Send SGEs. The head and tail iovec, and each entry @@ -574,7 +578,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, * DMA-mapped. Sync the content that has changed. */ if (!rpcrdma_dma_map_regbuf(ia, rb)) - return false; + goto out_regbuf; sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; @@ -662,6 +666,10 @@ out: req->rl_send_wr.num_sge += sge_no; return true; +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; + out_mapping_overflow: rpcrdma_unmap_sges(ia, req); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); @@ -673,26 +681,32 @@ out_mapping_err: return false; } -bool -rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 hdrlen, struct xdr_buf *xdr, - enum rpcrdma_chunktype rtype) +/** + * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR + * @r_xprt: controlling transport + * @req: context of RPC Call being marshalled + * @hdrlen: size of transport header, in bytes + * @xdr: xdr_buf containing RPC Call + * @rtype: chunk type being encoded + * + * Returns 0 on success; otherwise a negative errno is returned. + */ +int +rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { req->rl_send_wr.num_sge = 0; req->rl_mapped_sges = 0; - if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen)) - goto out_map; + if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + return -EIO; if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype)) - goto out_map; - - return true; + if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + return -EIO; -out_map: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; + return 0; } /** @@ -843,12 +857,10 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) transfertypes[rtype], transfertypes[wtype], xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, - xdr_stream_pos(xdr), - &rqst->rq_snd_buf, rtype)) { - ret = -EIO; + ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), + &rqst->rq_snd_buf, rtype); + if (ret) goto out_err; - } return 0; out_err: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0e0ae6195a5b..0b8ca5e5c706 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -613,8 +613,10 @@ enum rpcrdma_chunktype { rpcrdma_replych }; -bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, - u32, struct xdr_buf *, enum rpcrdma_chunktype); +int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype); void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); -- cgit v1.2.3 From a062a2a3efc5fece106d96d4a5165f3f23b5cbda Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:03 -0400 Subject: xprtrdma: "Unoptimize" rpcrdma_prepare_hdr_sge() Commit 655fec6987be ("xprtrdma: Use gathered Send for large inline messages") assumed that, since the zeroeth element of the Send SGE array always pointed to req->rl_rdmabuf, it needed to be initialized just once. This was a valid assumption because the Send SGE array and rl_rdmabuf both live in the same rpcrdma_req. In a subsequent patch, the Send SGE array will be separated from the rpcrdma_req, so the zeroeth element of the SGE array needs to be initialized every time. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e3ece9843f9d..7fd102960a81 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -533,7 +533,7 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) sge->addr, sge->length, DMA_TO_DEVICE); } -/* Prepare the RPC-over-RDMA header SGE. +/* Prepare an SGE for the RPC-over-RDMA transport header. */ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, @@ -542,13 +542,11 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, struct rpcrdma_regbuf *rb = req->rl_rdmabuf; struct ib_sge *sge = &req->rl_send_sge[0]; - if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { - if (!__rpcrdma_dma_map_regbuf(ia, rb)) - goto out_regbuf; - sge->addr = rdmab_addr(rb); - sge->lkey = rdmab_lkey(rb); - } + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge->addr = rdmab_addr(rb); sge->length = len; + sge->lkey = rdmab_lkey(rb); ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); -- cgit v1.2.3 From ae72950abf99fb250aca972b3451b6e06a096c68 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:12 -0400 Subject: xprtrdma: Add data structure to manage RDMA Send arguments Problem statement: Recently Sagi Grimberg observed that kernel RDMA- enabled storage initiators don't handle delayed Send completion correctly. If Send completion is delayed beyond the end of a ULP transaction, the ULP may release resources that are still being used by the HCA to complete a long-running Send operation. This is a common design trait amongst our initiators. Most Send operations are faster than the ULP transaction they are part of. Waiting for a completion for these is typically unnecessary. Infrequently, a network partition or some other problem crops up where an ordering problem can occur. In NFS parlance, the RPC Reply arrives and completes the RPC, but the HCA is still retrying the Send WR that conveyed the RPC Call. In this case, the HCA can try to use memory that has been invalidated or DMA unmapped, and the connection is lost. If that memory has been re-used for something else (possibly not related to NFS), and the Send retransmission exposes that data on the wire. Thus we cannot assume that it is safe to release Send-related resources just because a ULP reply has arrived. After some analysis, we have determined that the completion housekeeping will not be difficult for xprtrdma: - Inline Send buffers are registered via the local DMA key, and are already left DMA mapped for the lifetime of a transport connection, thus no additional handling is necessary for those - Gathered Sends involving page cache pages _will_ need to DMA unmap those pages after the Send completes. But like inline send buffers, they are registered via the local DMA key, and thus will not need to be invalidated In addition, RPC completion will need to wait for Send completion in the latter case. However, nearly always, the Send that conveys the RPC Call will have completed long before the RPC Reply arrives, and thus no additional latency will be accrued. Design notes: In this patch, the rpcrdma_sendctx object is introduced, and a lock-free circular queue is added to manage a set of them per transport. The RPC client's send path already prevents sending more than one RPC Call at the same time. This allows us to treat the consumer side of the queue (rpcrdma_sendctx_get_locked) as if there is a single consumer thread. The producer side of the queue (rpcrdma_sendctx_put_locked) is invoked only from the Send completion handler, which is a single thread of execution (soft IRQ). The only care that needs to be taken is with the tail index, which is shared between the producer and consumer. Only the producer updates the tail index. The consumer compares the head with the tail to ensure that the a sendctx that is in use is never handed out again (or, expressed more conventionally, the queue is empty). When the sendctx queue empties completely, there are enough Sends outstanding that posting more Send operations can result in a Send Queue overflow. In this case, the ULP is told to wait and try again. This introduces strong Send Queue accounting to xprtrdma. As a final touch, Jason Gunthorpe suggested a mechanism that does not require signaling every Send. We signal once every N Sends, and perform SGE unmapping of N Send operations during that one completion. Reported-by: Sagi Grimberg Suggested-by: Jason Gunthorpe Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 40 +++++---- net/sunrpc/xprtrdma/transport.c | 6 +- net/sunrpc/xprtrdma/verbs.c | 195 ++++++++++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/xprt_rdma.h | 38 ++++++-- 4 files changed, 247 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 7fd102960a81..9951c81b82ed 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -512,23 +512,26 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } /** - * rpcrdma_unmap_sges - DMA-unmap Send buffers - * @ia: interface adapter (device) - * @req: req with possibly some SGEs to be DMA unmapped + * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * @sc: sendctx containing SGEs to unmap * */ void -rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) { + struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; struct ib_sge *sge; unsigned int count; + dprintk("RPC: %s: unmapping %u sges for sc=%p\n", + __func__, sc->sc_unmap_count, sc); + /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. */ - sge = &req->rl_send_sge[2]; - for (count = req->rl_mapped_sges; count--; sge++) + sge = &sc->sc_sges[2]; + for (count = sc->sc_unmap_count; count; ++sge, --count) ib_dma_unmap_page(ia->ri_device, sge->addr, sge->length, DMA_TO_DEVICE); } @@ -539,8 +542,9 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, u32 len) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = &req->rl_send_sge[0]; + struct ib_sge *sge = sc->sc_sges; if (!rpcrdma_dma_map_regbuf(ia, rb)) goto out_regbuf; @@ -550,7 +554,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - req->rl_send_wr.num_sge++; + sc->sc_wr.num_sge++; return true; out_regbuf: @@ -565,10 +569,11 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; unsigned int sge_no, page_base, len, remaining; struct rpcrdma_regbuf *rb = req->rl_sendbuf; struct ib_device *device = ia->ri_device; - struct ib_sge *sge = req->rl_send_sge; + struct ib_sge *sge = sc->sc_sges; u32 lkey = ia->ri_pd->local_dma_lkey; struct page *page, **ppages; @@ -631,7 +636,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; ppages++; remaining -= len; page_base = 0; @@ -657,11 +662,11 @@ map_tail: goto out_mapping_err; sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; } out: - req->rl_send_wr.num_sge += sge_no; + sc->sc_wr.num_sge += sge_no; return true; out_regbuf: @@ -669,12 +674,12 @@ out_regbuf: return false; out_mapping_overflow: - rpcrdma_unmap_sges(ia, req); + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: - rpcrdma_unmap_sges(ia, req); + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: Send mapping error\n"); return false; } @@ -694,8 +699,11 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - req->rl_send_wr.num_sge = 0; - req->rl_mapped_sges = 0; + req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + if (!req->rl_sendctx) + return -ENOBUFS; + req->rl_sendctx->sc_wr.num_sge = 0; + req->rl_sendctx->sc_unmap_count = 0; if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) return -EIO; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index eb46d2479b09..7be6e2519197 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -687,7 +687,6 @@ xprt_rdma_free(struct rpc_task *task) if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); - rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); } @@ -790,11 +789,12 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, - r_xprt->rx_stats.local_inv_needed); + r_xprt->rx_stats.local_inv_needed, + r_xprt->rx_stats.empty_sendctx_q); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 247b00b715c2..1bf7b1ee5699 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -52,6 +52,8 @@ #include #include #include + +#include #include #include @@ -126,11 +128,17 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_sendctx *sc = + container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); + + rpcrdma_sendctx_put_locked(sc); } /** @@ -542,6 +550,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ + ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, + cdata->max_requests >> 2); + ep->rep_send_count = ep->rep_send_batch; ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; /* always signal? */ @@ -824,6 +835,168 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +/* Fixed-size circular FIFO queue. This implementation is wait-free and + * lock-free. + * + * Consumer is the code path that posts Sends. This path dequeues a + * sendctx for use by a Send operation. Multiple consumer threads + * are serialized by the RPC transport lock, which allows only one + * ->send_request call at a time. + * + * Producer is the code path that handles Send completions. This path + * enqueues a sendctx that has been completed. Multiple producer + * threads are serialized by the ib_poll_cq() function. + */ + +/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced + * queue activity, and ib_drain_qp has flushed all remaining Send + * requests. + */ +static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +{ + unsigned long i; + + for (i = 0; i <= buf->rb_sc_last; i++) + kfree(buf->rb_sc_ctxs[i]); + kfree(buf->rb_sc_ctxs); +} + +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +{ + struct rpcrdma_sendctx *sc; + + sc = kzalloc(sizeof(*sc) + + ia->ri_max_send_sges * sizeof(struct ib_sge), + GFP_KERNEL); + if (!sc) + return NULL; + + sc->sc_wr.wr_cqe = &sc->sc_cqe; + sc->sc_wr.sg_list = sc->sc_sges; + sc->sc_wr.opcode = IB_WR_SEND; + sc->sc_cqe.done = rpcrdma_wc_send; + return sc; +} + +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_sendctx *sc; + unsigned long i; + + /* Maximum number of concurrent outstanding Send WRs. Capping + * the circular queue size stops Send Queue overflow by causing + * the ->send_request call to fail temporarily before too many + * Sends are posted. + */ + i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; + dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + if (!buf->rb_sc_ctxs) + return -ENOMEM; + + buf->rb_sc_last = i - 1; + for (i = 0; i <= buf->rb_sc_last; i++) { + sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + if (!sc) + goto out_destroy; + + sc->sc_xprt = r_xprt; + buf->rb_sc_ctxs[i] = sc; + } + + return 0; + +out_destroy: + rpcrdma_sendctxs_destroy(buf); + return -ENOMEM; +} + +/* The sendctx queue is not guaranteed to have a size that is a + * power of two, thus the helpers in circ_buf.h cannot be used. + * The other option is to use modulus (%), which can be expensive. + */ +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return likely(item < buf->rb_sc_last) ? item + 1 : 0; +} + +/** + * rpcrdma_sendctx_get_locked - Acquire a send context + * @buf: transport buffers from which to acquire an unused context + * + * Returns pointer to a free send completion context; or NULL if + * the queue is empty. + * + * Usage: Called to acquire an SGE array before preparing a Send WR. + * + * The caller serializes calls to this function (per rpcrdma_buffer), + * and provides an effective memory barrier that flushes the new value + * of rb_sc_head. + */ +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_sendctx *sc; + unsigned long next_head; + + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + + if (next_head == READ_ONCE(buf->rb_sc_tail)) + goto out_emptyq; + + /* ORDER: item must be accessed _before_ head is updated */ + sc = buf->rb_sc_ctxs[next_head]; + + /* Releasing the lock in the caller acts as a memory + * barrier that flushes rb_sc_head. + */ + buf->rb_sc_head = next_head; + + return sc; + +out_emptyq: + /* The queue is "empty" if there have not been enough Send + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ + dprintk("RPC: %s: empty sendctx queue\n", __func__); + r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; +} + +/** + * rpcrdma_sendctx_put_locked - Release a send context + * @sc: send context to release + * + * Usage: Called from Send completion to return a sendctxt + * to the queue. + * + * The caller serializes calls to this function (per rpcrdma_buffer). + */ +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + unsigned long next_tail; + + /* Unmap SGEs of previously completed by unsignaled + * Sends by walking up the queue until @sc is found. + */ + next_tail = buf->rb_sc_tail; + do { + next_tail = rpcrdma_sendctx_next(buf, next_tail); + + /* ORDER: item must be accessed _before_ tail is updated */ + rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + + } while (buf->rb_sc_ctxs[next_tail] != sc); + + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); +} + static void rpcrdma_mr_recovery_worker(struct work_struct *work) { @@ -919,13 +1092,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; INIT_LIST_HEAD(&req->rl_registered); - req->rl_send_wr.next = NULL; - req->rl_send_wr.wr_cqe = &req->rl_cqe; - req->rl_send_wr.sg_list = req->rl_send_sge; - req->rl_send_wr.opcode = IB_WR_SEND; return req; } @@ -1017,6 +1185,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) list_add(&rep->rr_list, &buf->rb_recv_bufs); } + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + return 0; out: rpcrdma_buffer_destroy(buf); @@ -1093,6 +1265,8 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); + rpcrdma_sendctxs_destroy(buf); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1208,7 +1382,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_rep *rep = req->rl_reply; - req->rl_send_wr.num_sge = 0; req->rl_reply = NULL; spin_lock(&buffers->rb_lock); @@ -1340,7 +1513,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_send_wr; + struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; struct ib_send_wr *send_wr_fail; int rc; @@ -1354,7 +1527,13 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, dprintk("RPC: %s: posting %d s/g entries\n", __func__, send_wr->num_sge); - rpcrdma_set_signaled(ep, send_wr); + if (!ep->rep_send_count) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->rep_send_count = ep->rep_send_batch; + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + --ep->rep_send_count; + } rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); if (rc) goto out_postsend_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0b8ca5e5c706..537cfabe47d1 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -93,6 +93,8 @@ enum { */ struct rpcrdma_ep { + unsigned int rep_send_count; + unsigned int rep_send_batch; atomic_t rep_cqcount; int rep_cqinit; int rep_connected; @@ -232,6 +234,27 @@ struct rpcrdma_rep { struct ib_recv_wr rr_recv_wr; }; +/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes + */ +struct rpcrdma_xprt; +struct rpcrdma_sendctx { + struct ib_send_wr sc_wr; + struct ib_cqe sc_cqe; + struct rpcrdma_xprt *sc_xprt; + unsigned int sc_unmap_count; + struct ib_sge sc_sges[]; +}; + +/* Limit the number of SGEs that can be unmapped during one + * Send completion. This caps the amount of work a single + * completion can do before returning to the provider. + * + * Setting this to zero disables Send completion batching. + */ +enum { + RPCRDMA_MAX_SEND_BATCH = 7, +}; + /* * struct rpcrdma_mw - external memory region metadata * @@ -343,19 +366,16 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; struct xdr_buf rl_hdrbuf; - struct ib_send_wr rl_send_wr; - struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; + struct rpcrdma_sendctx *rl_sendctx; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ - struct ib_cqe rl_cqe; struct list_head rl_all; bool rl_backchannel; @@ -402,6 +422,11 @@ struct rpcrdma_buffer { struct list_head rb_mws; struct list_head rb_all; + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; @@ -456,6 +481,7 @@ struct rpcrdma_stats { unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + unsigned long empty_sendctx_q; /* accessed when receiving a reply */ unsigned long long total_rdma_reply; @@ -557,6 +583,8 @@ struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); @@ -617,7 +645,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype); -void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); +void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); -- cgit v1.2.3 From 531cca0c9b17c185377fd081b43ffca953cfecad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:20 -0400 Subject: xprtrdma: Add a field of bit flags to struct rpcrdma_req We have one boolean flag in rpcrdma_req today. I'd like to add more flags, so convert that boolean to a bit flag. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++++++- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index f0d5998330fe..088c9cc259d7 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -42,7 +42,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - req->rl_backchannel = true; + __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 7be6e2519197..acdb2e9c72c8 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -680,7 +680,7 @@ xprt_rdma_free(struct rpc_task *task) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_ia *ia = &r_xprt->rx_ia; - if (req->rl_backchannel) + if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) return; dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1bf7b1ee5699..bab63adf070b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1167,7 +1167,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(req); goto out; } - req->rl_backchannel = false; list_add(&req->rl_list, &buf->rb_send_bufs); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 537cfabe47d1..417532069842 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -377,12 +377,17 @@ struct rpcrdma_req { struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ struct list_head rl_all; - bool rl_backchannel; + unsigned long rl_flags; struct list_head rl_registered; /* registered segments */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +/* rl_flags */ +enum { + RPCRDMA_REQ_F_BACKCHANNEL = 0, +}; + static inline void rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) { -- cgit v1.2.3 From 0ba6f37012db2f88f881cd818aec6e1886f61abb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:28 -0400 Subject: xprtrdma: Refactor rpcrdma_deferred_completion Invoke a common routine for releasing hardware resources (for example, invalidating MRs). This needs to be done whether an RPC Reply has arrived or the RPC was terminated early. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++++++---------- net/sunrpc/xprtrdma/transport.c | 6 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9951c81b82ed..853dede38900 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1293,6 +1293,20 @@ out_badheader: goto out; } +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + if (!list_empty(&req->rl_registered)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); +} + /* Reply handling runs in the poll worker thread. Anything that * might wait is deferred to a separate workqueue. */ @@ -1301,18 +1315,9 @@ void rpcrdma_deferred_completion(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - /* Invalidate and unmap the data payloads before waking - * the waiting application. This guarantees the memory - * regions are properly fenced from the server before the - * application accesses the data. It also ensures proper - * send flow control: waking the next RPC waits until this - * RPC has relinquished all its Send Queue entries. - */ rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); - + rpcrdma_release_rqst(rep->rr_rxprt, req); rpcrdma_complete_rqst(rep); } @@ -1374,6 +1379,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) req = rpcr_to_rdmar(rqst); req->rl_reply = rep; rep->rr_rqst = rqst; + clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index acdb2e9c72c8..35aefe201848 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -678,15 +678,14 @@ xprt_rdma_free(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_ia *ia = &r_xprt->rx_ia; if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) return; dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (!list_empty(&req->rl_registered)) - ia->ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); + if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) + rpcrdma_release_rqst(r_xprt, req); rpcrdma_buffer_put(req); } @@ -742,6 +741,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; + set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 417532069842..c260475baa36 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -386,6 +386,7 @@ struct rpcrdma_req { /* rl_flags */ enum { RPCRDMA_REQ_F_BACKCHANNEL = 0, + RPCRDMA_REQ_F_PENDING, }; static inline void @@ -655,6 +656,8 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req); void rpcrdma_deferred_completion(struct work_struct *work); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) -- cgit v1.2.3 From 01bb35c89d90abe6fd1c0be001f84bbdfa7fa7d1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:36 -0400 Subject: xprtrdma: RPC completion should wait for Send completion When an RPC Call includes a file data payload, that payload can come from pages in the page cache, or a user buffer (for direct I/O). If the payload can fit inline, xprtrdma includes it in the Send using a scatter-gather technique. xprtrdma mustn't allow the RPC consumer to re-use the memory where that payload resides before the Send completes. Otherwise, the new contents of that memory would be exposed by an HCA retransmit of the Send operation. So, block RPC completion on Send completion, but only in the case where a separate file data payload is part of the Send. This prevents the reuse of that memory while it is still part of a Send operation without an undue cost to other cases. Waiting is avoided in the common case because typically the Send will have completed long before the RPC Reply arrives. These days, an RPC timeout will trigger a disconnect, which tears down the QP. The disconnect flushes all waiting Sends. This bounds the amount of time the reply handler has to wait for a Send completion. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 26 +++++++++++++++++++++++++- net/sunrpc/xprtrdma/transport.c | 5 +++-- net/sunrpc/xprtrdma/verbs.c | 3 ++- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++++ 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 853dede38900..4fdeaac6ebe6 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -534,6 +534,11 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) for (count = sc->sc_unmap_count; count; ++sge, --count) ib_dma_unmap_page(ia->ri_device, sge->addr, sge->length, DMA_TO_DEVICE); + + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); + } } /* Prepare an SGE for the RPC-over-RDMA transport header. @@ -667,6 +672,8 @@ map_tail: out: sc->sc_wr.num_sge += sge_no; + if (sc->sc_unmap_count) + __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); return true; out_regbuf: @@ -704,6 +711,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, return -ENOBUFS; req->rl_sendctx->sc_wr.num_sge = 0; req->rl_sendctx->sc_unmap_count = 0; + req->rl_sendctx->sc_req = req; + __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) return -EIO; @@ -1305,6 +1314,20 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) if (!list_empty(&req->rl_registered)) r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered); + + /* Ensure that any DMA mapped pages associated with + * the Send of the RPC Call have been unmapped before + * allowing the RPC to complete. This protects argument + * memory not controlled by the RPC client from being + * re-used before we're done with it. + */ + if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + r_xprt->rx_stats.reply_waits_for_send++; + out_of_line_wait_on_bit(&req->rl_flags, + RPCRDMA_REQ_F_TX_RESOURCES, + bit_wait, + TASK_UNINTERRUPTIBLE); + } } /* Reply handling runs in the poll worker thread. Anything that @@ -1384,7 +1407,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); - if (list_empty(&req->rl_registered)) + if (list_empty(&req->rl_registered) && + !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) rpcrdma_complete_rqst(rep); else queue_work(rpcrdma_receive_wq, &rep->rr_work); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 35aefe201848..9fdd11e4758c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -789,12 +789,13 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, - r_xprt->rx_stats.empty_sendctx_q); + r_xprt->rx_stats.empty_sendctx_q, + r_xprt->rx_stats.reply_waits_for_send); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index bab63adf070b..9a824fe8ffc2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1526,7 +1526,8 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, dprintk("RPC: %s: posting %d s/g entries\n", __func__, send_wr->num_sge); - if (!ep->rep_send_count) { + if (!ep->rep_send_count || + test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { send_wr->send_flags |= IB_SEND_SIGNALED; ep->rep_send_count = ep->rep_send_batch; } else { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c260475baa36..bccd5d8b9384 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -236,11 +236,13 @@ struct rpcrdma_rep { /* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes */ +struct rpcrdma_req; struct rpcrdma_xprt; struct rpcrdma_sendctx { struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; struct rpcrdma_xprt *sc_xprt; + struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; }; @@ -387,6 +389,7 @@ struct rpcrdma_req { enum { RPCRDMA_REQ_F_BACKCHANNEL = 0, RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_TX_RESOURCES, }; static inline void @@ -492,6 +495,7 @@ struct rpcrdma_stats { /* accessed when receiving a reply */ unsigned long long total_rdma_reply; unsigned long long fixup_copy_count; + unsigned long reply_waits_for_send; unsigned long local_inv_needed; unsigned long nomsg_call_count; unsigned long bcall_count; -- cgit v1.2.3 From 6f0afc28257dfa769c210f8f8da0f21d77e7452f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Oct 2017 10:48:45 -0400 Subject: xprtrdma: Remove atomic send completion counting The sendctx circular queue now guarantees that xprtrdma cannot overflow the Send Queue, so remove the remaining bits of the original Send WQE counting mechanism. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 8 -------- net/sunrpc/xprtrdma/verbs.c | 4 ---- net/sunrpc/xprtrdma/xprt_rdma.h | 21 --------------------- 3 files changed, 33 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 3053fb0f5cb3..404166ac958f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -419,7 +419,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rpcrdma_set_signaled(&r_xprt->rx_ep, ®_wr->wr); rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -507,12 +506,6 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) f->fr_cqe.done = frwr_wc_localinv_wake; reinit_completion(&f->fr_linv_done); - /* Initialize CQ count, since there is always a signaled - * WR being posted here. The new cqcount depends on how - * many SQEs are about to be consumed. - */ - rpcrdma_init_cqcount(&r_xprt->rx_ep, count); - /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us * unless ri_id->qp is a valid pointer. @@ -545,7 +538,6 @@ reset_mrs: /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. */ - rpcrdma_init_cqcount(&r_xprt->rx_ep, -count); while (bad_wr) { f = container_of(bad_wr, struct rpcrdma_frmr, fr_invwr); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9a824fe8ffc2..22128a81da63 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -553,10 +553,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; /* always signal? */ - rpcrdma_init_cqcount(ep, 0); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bccd5d8b9384..6e64c8259d34 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -95,8 +95,6 @@ enum { struct rpcrdma_ep { unsigned int rep_send_count; unsigned int rep_send_batch; - atomic_t rep_cqcount; - int rep_cqinit; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; @@ -106,25 +104,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -static inline void -rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count) -{ - atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count); -} - -/* To update send queue accounting, provider must take a - * send completion every now and then. - */ -static inline void -rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr) -{ - send_wr->send_flags = 0; - if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) { - rpcrdma_init_cqcount(ep, 0); - send_wr->send_flags = IB_SEND_SIGNALED; - } -} - /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are * allocated when the forward channel is set up. -- cgit v1.2.3 From a4699f5647f369e8ab7ec56b7cd98580c933c3f3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:21:49 -0400 Subject: xprtrdma: Put Send CQ in IB_POLL_WORKQUEUE mode Lift the Send and LocalInv completion handlers out of soft IRQ mode to make room for other work. Also, move the Send CQ to a different CPU than the CPU where the Receive CQ is running, for improved scalability. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 22128a81da63..4cfa893def2c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -558,7 +558,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, - 0, IB_POLL_SOFTIRQ); + 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", -- cgit v1.2.3 From 2232df5ece121fd7049ccff95cbb3acfab278d75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:21:57 -0400 Subject: rpcrdma: Remove C structure definitions of XDR data items Clean up: C-structure style XDR encoding and decoding logic has been replaced over the past several merge windows on both the client and server. These data structures are no longer used. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4fdeaac6ebe6..45cb5497b37f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -75,11 +75,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) /* Maximum Read list size */ maxsegs += 2; /* segment for head and tail buffers */ - size = maxsegs * sizeof(struct rpcrdma_read_chunk); + size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ - size += sizeof(struct rpcrdma_segment); + size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max call header size = %u\n", @@ -102,7 +102,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /* Maximum Write list size */ maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ - size += maxsegs * sizeof(struct rpcrdma_segment); + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max reply header size = %u\n", diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 6e64c8259d34..8b9954c41ee4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -145,12 +145,6 @@ rdmab_lkey(struct rpcrdma_regbuf *rb) return rb->rg_iov.lkey; } -static inline struct rpcrdma_msg * -rdmab_to_msg(struct rpcrdma_regbuf *rb) -{ - return (struct rpcrdma_msg *)rb->rg_base; -} - static inline struct ib_device * rdmab_device(struct rpcrdma_regbuf *rb) { -- cgit v1.2.3 From 1b746c1e9c1c9eea9eab9e3c1879281614717b28 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:22:06 -0400 Subject: xprtrdma: Remove include for linux/prefetch.h Clean up. This include should have been removed by commit 23826c7aeac7 ("xprtrdma: Serialize credit accounting again"). Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4cfa893def2c..be61c29432d0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -49,7 +49,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From 62b56a675565a2e40f2cdf50455977448fd87413 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:22:14 -0400 Subject: xprtrdma: Update copyright notices Credit work contributed by Oracle engineers since 2014. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtrdma/verbs.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 45cb5497b37f..ed34dc0f144c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9fdd11e4758c..646c24494ea7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index be61c29432d0..710b3f77db82 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8b9954c41ee4..51686d9eac5f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two -- cgit v1.2.3 From e9d476393504ff0f9ab38d88d7857ec6a2c81ff6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Oct 2017 11:48:30 -0500 Subject: net: sunrpc: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 11 +++++++++++ net/sunrpc/xprt.c | 1 + net/sunrpc/xprtsock.c | 4 ++++ 3 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2ad827db2704..d25b077d326d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1586,6 +1586,7 @@ call_reserveresult(struct rpc_task *task) switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); + /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; @@ -1647,10 +1648,13 @@ call_refreshresult(struct rpc_task *task) /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ + /* fall through */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: status = -EACCES; + /* fall through */ case -EKEYEXPIRED: if (!task->tk_cred_retry) break; @@ -1911,6 +1915,7 @@ call_connect_status(struct rpc_task *task) task->tk_action = call_bind; return; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -1924,6 +1929,7 @@ call_connect_status(struct rpc_task *task) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: /* Check for timeouts before looping back to call_bind */ case -ETIMEDOUT: @@ -2025,6 +2031,7 @@ call_transmit_status(struct rpc_task *task) rpc_exit(task, task->tk_status); break; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: @@ -2145,6 +2152,7 @@ call_status(struct rpc_task *task) * were a timeout. */ rpc_delay(task, 3*HZ); + /* fall through */ case -ETIMEDOUT: task->tk_action = call_timeout; break; @@ -2152,14 +2160,17 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + /* fall through */ case -EADDRINUSE: rpc_delay(task, 3*HZ); + /* fall through */ case -EPIPE: case -ENOTCONN: task->tk_action = call_bind; break; case -ENOBUFS: rpc_delay(task, HZ>>2); + /* fall through */ case -EAGAIN: task->tk_action = call_transmit; break; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e741ec2b4d8e..02a9bacb239b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1139,6 +1139,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); + /* fall through */ default: task->tk_status = -EAGAIN; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c1841f234a71..684e356b40e4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -551,6 +551,7 @@ static int xs_local_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + /* fall through */ case -EPIPE: xs_close(xprt); status = -ENOTCONN; @@ -1610,6 +1611,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_tcp_force_close(xprt); + /* fall through */ case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2367,6 +2369,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) switch (ret) { case 0: xs_set_srcport(transport, sock); + /* fall through */ case -EINPROGRESS: /* SYN_SENT! */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) @@ -2418,6 +2421,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) default: printk("%s: connect returned unhandled error %d\n", __func__, status); + /* fall through */ case -EADDRNOTAVAIL: /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry -- cgit v1.2.3 From b2bfe5915d5fe7577221031a39ac722a0a2a1199 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Nov 2017 13:46:06 -0400 Subject: sunrpc: Fix rpc_task_begin trace point The rpc_task_begin trace point always display a task ID of zero. Move the trace point call site so that it picks up the new task ID. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 0cc83839c13c..f9db5fe52d36 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -274,10 +274,9 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task) static void rpc_set_active(struct rpc_task *task) { - trace_rpc_task_begin(task->tk_client, task, NULL); - rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); + trace_rpc_task_begin(task->tk_client, task, NULL); } /* -- cgit v1.2.3 From c435da68b6d1adc71d46b7833bf2c568e4420839 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Nov 2017 13:46:14 -0400 Subject: sunrpc: Add rpc_request static trace point Display information about the RPC procedure being requested in the trace log. This sometimes critical information cannot always be derived from other RPC trace entries. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d25b077d326d..a801da812f86 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1491,7 +1491,6 @@ rpc_restart_call(struct rpc_task *task) } EXPORT_SYMBOL_GPL(rpc_restart_call); -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) const char *rpc_proc_name(const struct rpc_task *task) { @@ -1505,7 +1504,6 @@ const char } else return "no proc"; } -#endif /* * 0. Initial state @@ -1519,6 +1517,7 @@ call_start(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int idx = task->tk_msg.rpc_proc->p_statidx; + trace_rpc_request(task); dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), -- cgit v1.2.3 From 4112be70becb82bc9a53cf2d11ab51c35602b063 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 11:48:43 +0300 Subject: sunrpc: exit_net cleanup check added Be sure that all_clients list initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: Anna Schumaker --- net/sunrpc/sunrpc_syms.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index c73de181467a..56f9eff74150 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -65,10 +65,13 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); + WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { -- cgit v1.2.3 From 6c67a3e4a46a95c8aa8228dafb3676bc1a9b4871 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 8 Nov 2017 08:57:32 +0300 Subject: sunrpc: remove net pointer from messages Publishing of net pointer is not safe, use net->ns.inum as net ID [ 171.391947] RPC: created new rpcb local clients (rpcb_local_clnt: ..., rpcb_local_clnt4: ...) for net f00001e7 [ 171.767188] NFSD: starting 90-second grace period (net f00001e7) Signed-off-by: Vasily Averin Signed-off-by: Anna Schumaker --- net/sunrpc/rpc_pipe.c | 8 ++++---- net/sunrpc/rpcb_clnt.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 61a504fb1ae2..7803f3b6aa53 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1410,8 +1410,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return PTR_ERR(gssd_dentry); } - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, @@ -1462,8 +1462,8 @@ static void rpc_kill_sb(struct super_block *sb) goto out; } sn->pipefs_sb = NULL; - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index ea0676f199c8..c526f8fb37c9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -216,9 +216,9 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, smp_wmb(); sn->rpcb_users = 1; dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " - "%p, rpcb_local_clnt4: %p) for net %p%s\n", - sn->rpcb_local_clnt, sn->rpcb_local_clnt4, - net, (net == &init_net) ? " (init_net)" : ""); + "%p, rpcb_local_clnt4: %p) for net %x%s\n", + sn->rpcb_local_clnt, sn->rpcb_local_clnt4, + net->ns.inum, (net == &init_net) ? " (init_net)" : ""); } /* -- cgit v1.2.3 From 17e48577752a61197a7e5f4922db45b3a5af5068 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 16 Nov 2017 12:03:34 -0500 Subject: net/netlabel: Add list_next_rcu() in rcu_dereference(). Add list_next_rcu() for fetching next list in rcu_deference safely. Found with sparse in linux-next tree on tag next-20171116. Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- net/netlabel/netlabel_addrlist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h index d0f38bc9af6d..ac709f0f197b 100644 --- a/net/netlabel/netlabel_addrlist.h +++ b/net/netlabel/netlabel_addrlist.h @@ -87,7 +87,7 @@ static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; @@ -154,7 +154,7 @@ static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; -- cgit v1.2.3 From ecca8f88da5c4260cc2bccfefd2a24976704c366 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Nov 2017 14:11:11 +0800 Subject: sctp: set frag_point in sctp_setsockopt_maxseg correctly Now in sctp_setsockopt_maxseg user_frag or frag_point can be set with val >= 8 and val <= SCTP_MAX_CHUNK_LEN. But both checks are incorrect. val >= 8 means frag_point can even be less than SCTP_DEFAULT_MINSEGMENT. Then in sctp_datamsg_from_user(), when it's value is greater than cookie echo len and trying to bundle with cookie echo chunk, the first_len will overflow. The worse case is when it's value is equal as cookie echo len, first_len becomes 0, it will go into a dead loop for fragment later on. In Hangbin syzkaller testing env, oom was even triggered due to consecutive memory allocation in that loop. Besides, SCTP_MAX_CHUNK_LEN is the max size of the whole chunk, it should deduct the data header for frag_point or user_frag check. This patch does a proper check with SCTP_DEFAULT_MINSEGMENT subtracting the sctphdr and datahdr, SCTP_MAX_CHUNK_LEN subtracting datahdr when setting frag_point via sockopt. It also improves sctp_setsockopt_maxseg codes. Suggested-by: Marcelo Ricardo Leitner Reported-by: Hangbin Liu Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4c0a77292792..3204a9b29407 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3140,9 +3140,9 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign */ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_sock *sp = sctp_sk(sk); struct sctp_assoc_value params; struct sctp_association *asoc; - struct sctp_sock *sp = sctp_sk(sk); int val; if (optlen == sizeof(int)) { @@ -3158,26 +3158,35 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; val = params.assoc_value; - } else + } else { return -EINVAL; + } - if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))) - return -EINVAL; + if (val) { + int min_len, max_len; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id && sctp_style(sk, UDP)) - return -EINVAL; + min_len = SCTP_DEFAULT_MINSEGMENT - sp->pf->af->net_header_len; + min_len -= sizeof(struct sctphdr) + + sizeof(struct sctp_data_chunk); + + max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk); + if (val < min_len || val > max_len) + return -EINVAL; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); if (asoc) { if (val == 0) { - val = asoc->pathmtu; - val -= sp->pf->af->net_header_len; + val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sizeof(struct sctp_data_chunk); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); } else { + if (params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; sp->user_frag = val; } -- cgit v1.2.3 From e39d5246111399dbc6e11cd39fd8580191b86c47 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Nov 2017 14:27:06 +0800 Subject: route: update fnhe_expires for redirect when the fnhe exists Now when creating fnhe for redirect, it sets fnhe_expires for this new route cache. But when updating the exist one, it doesn't do it. It will cause this fnhe never to be expired. Paolo already noticed it before, in Jianlin's test case, it became even worse: When ip route flush cache, the old fnhe is not to be removed, but only clean it's members. When redirect comes again, this fnhe will be found and updated, but never be expired due to fnhe_expires not being set. So fix it by simply updating fnhe_expires even it's for redirect. Fixes: aee06da6726d ("ipv4: use seqlock for nh_exceptions") Reported-by: Jianlin Shi Acked-by: Hannes Frederic Sowa Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/route.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3b427757b1f8..11cf2fe43308 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -678,10 +678,9 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, if (fnhe) { if (gw) fnhe->fnhe_gw = gw; - if (pmtu) { + if (pmtu) fnhe->fnhe_pmtu = pmtu; - fnhe->fnhe_expires = max(1UL, expires); - } + fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) -- cgit v1.2.3 From cebe84c6190d741045a322f5343f717139993c08 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Nov 2017 14:27:18 +0800 Subject: route: also update fnhe_genid when updating a route cache Now when ip route flush cache and it turn out all fnhe_genid != genid. If a redirect/pmtu icmp packet comes and the old fnhe is found and all it's members but fnhe_genid will be updated. Then next time when it looks up route and tries to rebind this fnhe to the new dst, the fnhe will be flushed due to fnhe_genid != genid. It causes this redirect/pmtu icmp packet acutally not to be applied. This patch is to also reset fnhe_genid when updating a route cache. Fixes: 5aad1de5ea2c ("ipv4: use separate genid for next hop exceptions") Acked-by: Hannes Frederic Sowa Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/route.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11cf2fe43308..43b69af242e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; + u32 genid, hval; unsigned int i; int depth; - u32 hval = fnhe_hashfun(daddr); + + genid = fnhe_genid(dev_net(nh->nh_dev)); + hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); @@ -676,6 +679,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, } if (fnhe) { + if (fnhe->fnhe_genid != genid) + fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) @@ -699,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_next = hash->chain; rcu_assign_pointer(hash->chain, fnhe); } - fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; -- cgit v1.2.3 From 981542c526ecd846920bc500e9989da906ee9fb9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 17 Nov 2017 19:16:17 +0300 Subject: gre6: use log_ecn_error module parameter in ip6_tnl_rcv() After commit 308edfdf1563 ("gre6: Cleanup GREv6 receive path, call common GRE functions") it's not used anywhere in the module, but previously was used in ip6gre_rcv(). Fixes: 308edfdf1563 ("gre6: Cleanup GREv6 receive path, call common GRE functions") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b90bad7a4e56..4cfd8e0696fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -460,7 +460,7 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, false); + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); return PACKET_RCVD; } -- cgit v1.2.3 From ed66dfaf236c04d414de1d218441296e57fb2bd2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 17 Nov 2017 21:06:14 -0500 Subject: tcp: when scheduling TLP, time of RTO should account for current ACK Fix the TLP scheduling logic so that when scheduling a TLP probe, we ensure that the estimated time at which an RTO would fire accounts for the fact that ACKs indicating forward progress should push back RTO times. After the following fix: df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed") we had an unintentional behavior change in the following kind of scenario: suppose the RTT variance has been very low recently. Then suppose we send out a flight of N packets and our RTT is 100ms: t=0: send a flight of N packets t=100ms: receive an ACK for N-1 packets The response before df92c8394e6e that was: -> schedule a TLP for now + RTO_interval The response after df92c8394e6e is: -> schedule a TLP for t=0 + RTO_interval Since RTO_interval = srtt + RTT_variance, this means that we have scheduled a TLP timer at a point in the future that only accounts for RTT_variance. If the RTT_variance term is small, this means that the timer fires soon. Before df92c8394e6e this would not happen, because in that code, when we receive an ACK for a prefix of flight, we did: 1) Near the top of tcp_ack(), switch from TLP timer to RTO at write_queue_head->paket_tx_time + RTO_interval: if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); 2) In tcp_clean_rtx_queue(), update the RTO to now + RTO_interval: if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); 3) In tcp_ack() after tcp_fastretrans_alert() switch from RTO to TLP at now + RTO_interval: if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); In df92c8394e6e we removed that 3-phase dance, and instead directly set the TLP timer once: we set the TLP timer in cases like this to write_queue_head->packet_tx_time + RTO_interval. So if the RTT variance is small, then this means that this is setting the TLP timer to fire quite soon. This means if the ACK for the tail of the flight takes longer than an RTT to arrive (often due to delayed ACKs), then the TLP timer fires too quickly. Fixes: df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f844c06c0676..734cfc8ff76e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2964,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk) /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { - if (!tcp_schedule_loss_probe(sk)) + if (!tcp_schedule_loss_probe(sk, true)) tcp_rearm_rto(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 540b7d92cc70..a4d214c7b506 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2391,7 +2391,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) - tcp_schedule_loss_probe(sk); + tcp_schedule_loss_probe(sk, false); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; @@ -2399,7 +2399,7 @@ repair: return !tp->packets_out && !tcp_write_queue_empty(sk); } -bool tcp_schedule_loss_probe(struct sock *sk) +bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2440,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) } /* If the RTO formula yields an earlier time, then use that time. */ - rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + rto_delta_us = advancing_rto ? + jiffies_to_usecs(inet_csk(sk)->icsk_rto) : + tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); -- cgit v1.2.3 From 61b272c3aa170b3e461b8df636407b29f35f98eb Mon Sep 17 00:00:00 2001 From: Tuomas Tynkkynen Date: Sun, 19 Nov 2017 11:28:43 +0200 Subject: 9p: Fix missing commas in mount options Since commit c4fac9100456 ("9p: Implement show_options"), the mount options of 9p filesystems are printed out with some missing commas between the individual options: p9-scratch on /mnt/scratch type 9p (rw,dirsync,loose,access=clienttrans=virtio) Add them back. Cc: stable@vger.kernel.org # 4.13+ Fixes: c4fac9100456 ("9p: Implement show_options") Signed-off-by: Tuomas Tynkkynen Signed-off-by: Al Viro --- net/9p/client.c | 2 +- net/9p/trans_fd.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 1beb131dd3e1..b433aff5ff13 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -82,7 +82,7 @@ int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->msize != 8192) seq_printf(m, ",msize=%u", clnt->msize); - seq_printf(m, "trans=%s", clnt->trans_mod->name); + seq_printf(m, ",trans=%s", clnt->trans_mod->name); switch (clnt->proto_version) { case p9_proto_legacy: diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 903a190319b9..985046ae4231 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -724,12 +724,12 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->trans_mod == &p9_tcp_trans) { if (clnt->trans_opts.tcp.port != P9_PORT) - seq_printf(m, "port=%u", clnt->trans_opts.tcp.port); + seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port); } else if (clnt->trans_mod == &p9_fd_trans) { if (clnt->trans_opts.fd.rfd != ~0) - seq_printf(m, "rfd=%u", clnt->trans_opts.fd.rfd); + seq_printf(m, ",rfd=%u", clnt->trans_opts.fd.rfd); if (clnt->trans_opts.fd.wfd != ~0) - seq_printf(m, "wfd=%u", clnt->trans_opts.fd.wfd); + seq_printf(m, ",wfd=%u", clnt->trans_opts.fd.wfd); } return 0; } -- cgit v1.2.3 From 07dc8bc9a6b15f54d3ad962af74a096c7d7b42b4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Nov 2017 10:08:01 +0000 Subject: netfilter: remove redundant assignment to e The assignment to variable e is redundant since the same assignment occurs just a few lines later, hence it can be removed. Cleans up clang warning for arp_tables, ip_tables and ip6_tables: warning: Value stored to 'e' is never read Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv6/netfilter/ip6_tables.c | 1 - 3 files changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f88221aebc9d..0c3c944a7b72 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -373,7 +373,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4cbe5e80f3bf..2e0d339028bb 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -439,7 +439,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f06e25065a34..1d7ae9366335 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -458,7 +458,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; -- cgit v1.2.3 From 613d0776d3fe7eb28c695a63a5533a1ec8258c86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 14:32:37 +0300 Subject: netfilter: exit_net cleanup check added Be sure that lists initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 + net/netfilter/nf_tables_api.c | 7 +++++++ net/netfilter/nfnetlink_log.c | 5 +++++ net/netfilter/nfnetlink_queue.c | 5 +++++ net/netfilter/x_tables.c | 9 +++++++++ 5 files changed, 27 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 17b4ca562944..e35b8d074f06 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -819,6 +819,7 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); + WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d8327b43e4dc..10798b357481 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5847,6 +5847,12 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_exit_net(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->nft.af_info)); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); +} + int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; @@ -5917,6 +5923,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cad6498f10b0..1f511ed0fea3 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1093,10 +1093,15 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { + struct nfnl_log_net *log = nfnl_log_pernet(net); + unsigned int i; + #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index a16356cacec3..c09b36755ed7 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1512,10 +1512,15 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int i; + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a77dd514297c..55802e97f906 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1729,8 +1729,17 @@ static int __net_init xt_net_init(struct net *net) return 0; } +static void __net_exit xt_net_exit(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); +} + static struct pernet_operations xt_net_ops = { .init = xt_net_init, + .exit = xt_net_exit, }; static int __init xt_init(void) -- cgit v1.2.3 From bc7d811ace4ad39a3941089ca871633366878719 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:40 +0100 Subject: netfilter: nf_ct_h323: Convert CHECK_BOUND macro to function It is bad practive to return in a macro, this patch moves the check into a function. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 94 +++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index cf1bf2605c10..3d9a009ac147 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -103,7 +103,6 @@ struct bitstr { #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} -#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) static unsigned int get_len(struct bitstr *bs); static unsigned int get_bit(struct bitstr *bs); static unsigned int get_bits(struct bitstr *bs, unsigned int b); @@ -165,6 +164,14 @@ static unsigned int get_len(struct bitstr *bs) return v; } +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +{ + if (*bs->cur + bytes > *bs->end) + return 1; + + return 0; +} + /****************************************************************************/ static unsigned int get_bit(struct bitstr *bs) { @@ -280,7 +287,8 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, INC_BIT(bs); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -293,11 +301,14 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; + len = *bs->cur++; bs->cur += len; + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; - CHECK_BOUND(bs, 0); return H323_ERROR_NONE; } @@ -330,7 +341,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; break; @@ -341,7 +353,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -357,7 +370,8 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -375,12 +389,14 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); break; default: @@ -391,7 +407,8 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -409,7 +426,8 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -440,12 +458,14 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -458,7 +478,8 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -473,7 +494,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -484,7 +506,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -525,9 +548,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -556,7 +581,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get the extension bitmap */ bmp2_len = get_bits(bs, 7) + 1; - CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) @@ -567,9 +593,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; continue; } @@ -583,9 +611,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -623,19 +653,22 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = get_len(bs); break; default: @@ -659,7 +692,8 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, if (son->attr & OPEN) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -728,7 +762,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; } @@ -743,7 +778,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); -- cgit v1.2.3 From ec8a8f3c31ddef0a7d9626c4b8a4baa30f3b80aa Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:41 +0100 Subject: netfilter: nf_ct_h323: Extend nf_h323_error_boundary to work on bits as well This patch fixes several out of bounds memory reads by extending the nf_h323_error_boundary() function to work on bits as well an check the affected parts. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 92 +++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 3d9a009ac147..dc6347342e34 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -164,8 +164,13 @@ static unsigned int get_len(struct bitstr *bs) return v; } -static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) { + bits += bs->bit; + bytes += bits / BITS_PER_BYTE; + if (bits % BITS_PER_BYTE > 0) + bytes++; + if (*bs->cur + bytes > *bs->end) return 1; @@ -286,8 +291,7 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); INC_BIT(bs); - - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -301,12 +305,12 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = *bs->cur++; bs->cur += len; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; @@ -330,6 +334,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, bs->cur += 2; break; case CONS: /* 64K < Range < 4G */ + if (nf_h323_error_boundary(bs, 0, 2)) + return H323_ERROR_BOUND; len = get_bits(bs, 2) + 1; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ @@ -341,7 +347,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; @@ -353,7 +359,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -370,7 +376,7 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -389,13 +395,13 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); break; @@ -407,7 +413,7 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -421,12 +427,14 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -458,17 +466,19 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -478,7 +488,7 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -494,11 +504,13 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -506,7 +518,7 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -526,9 +538,13 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Extensible? */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; ext = (f->attr & EXT) ? get_bit(bs) : 0; /* Get fields bitmap */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -548,10 +564,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -580,8 +596,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; /* Get the extension bitmap */ + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; bmp2_len = get_bits(bs, 7) + 1; - if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; @@ -593,10 +611,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; continue; @@ -611,10 +629,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", @@ -653,13 +671,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; @@ -667,11 +685,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = get_len(bs); break; default: + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; count = get_bits(bs, f->sz); break; } @@ -691,8 +711,10 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, for (i = 0; i < count; i++) { if (son->attr & OPEN) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -744,11 +766,17 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Decode the choice index number */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; if ((f->attr & EXT) && get_bit(bs)) { ext = 1; + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; type = get_bits(bs, 7) + f->lb; } else { ext = 0; + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; type = get_bits(bs, f->sz); if (type >= f->lb) return H323_ERROR_RANGE; @@ -761,8 +789,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, /* Check Range */ if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; @@ -777,8 +807,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", -- cgit v1.2.3 From fbcd253d2448b8f168241e38f629a36c4c8c1e94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 19 Nov 2017 21:27:28 +0100 Subject: netfilter: conntrack: lower timeout to RETRANS seconds if window is 0 When zero window is announced we can get into a situation where connection stays around forever: 1. One side announces zero window. 2. Other side closes. In this case, no FIN is sent (stuck in send queue). Unless other side opens the window up again conntrack stays in ESTABLISHED state for a very long time. Lets alleviate this by lowering the timeout to RETRANS (5 minutes), the other end should be sending zero window probes to keep the connection established as long as a socket still exists. Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b12fc07111d0..37ef35b861f2 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1039,6 +1039,9 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; + else if (ct->proto.tcp.last_win == 0 && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); -- cgit v1.2.3 From 34f11cd329580fe4c3e8f10081d687331fc710f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 16:35:49 -0700 Subject: mac80211: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Johannes Berg Cc: "David S. Miller" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 7 +++---- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/led.c | 11 ++++++----- net/mac80211/main.c | 3 +-- net/mac80211/mesh.c | 27 ++++++++++++--------------- net/mac80211/mesh.h | 2 +- net/mac80211/mesh_hwmp.c | 4 ++-- net/mac80211/mesh_pathtbl.c | 3 +-- net/mac80211/mlme.c | 32 ++++++++++++++------------------ net/mac80211/ocb.c | 10 +++++----- net/mac80211/sta_info.c | 7 +++---- 11 files changed, 50 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index e9c6aa3ed05b..db07e0de9a03 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1711,10 +1711,10 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_ibss_timer(unsigned long data) +static void ieee80211_ibss_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.ibss.timer); ieee80211_queue_work(&sdata->local->hw, &sdata->work); } @@ -1723,8 +1723,7 @@ void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - setup_timer(&ifibss->timer, ieee80211_ibss_timer, - (unsigned long) sdata); + timer_setup(&ifibss->timer, ieee80211_ibss_timer, 0); INIT_LIST_HEAD(&ifibss->incomplete_stations); spin_lock_init(&ifibss->incomplete_lock); INIT_WORK(&ifibss->csa_connection_drop_work, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 68f874e73561..885d00b41911 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1057,6 +1057,7 @@ struct tpt_led_trigger { const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; struct timer_list timer; + struct ieee80211_local *local; unsigned long prev_traffic; unsigned long tx_bytes, rx_bytes; unsigned int active, want; @@ -1932,7 +1933,7 @@ static inline int ieee80211_ac_from_tid(int tid) void ieee80211_dynamic_ps_enable_work(struct work_struct *work); void ieee80211_dynamic_ps_disable_work(struct work_struct *work); -void ieee80211_dynamic_ps_timer(unsigned long data); +void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 0505845b7ab8..ba0b507ea691 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -248,10 +248,10 @@ static unsigned long tpt_trig_traffic(struct ieee80211_local *local, return DIV_ROUND_UP(delta, 1024 / 8); } -static void tpt_trig_timer(unsigned long data) +static void tpt_trig_timer(struct timer_list *t) { - struct ieee80211_local *local = (void *)data; - struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); + struct ieee80211_local *local = tpt_trig->local; struct led_classdev *led_cdev; unsigned long on, off, tpt; int i; @@ -306,8 +306,9 @@ __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; + tpt_trig->local = local; - setup_timer(&tpt_trig->timer, tpt_trig_timer, (unsigned long)local); + timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; @@ -326,7 +327,7 @@ static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; - tpt_trig_timer((unsigned long)local); + tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 8aa1f5b6a051..e054a2fd8d38 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -633,8 +633,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_dynamic_ps_enable_work); INIT_WORK(&local->dynamic_ps_disable_work, ieee80211_dynamic_ps_disable_work); - setup_timer(&local->dynamic_ps_timer, - ieee80211_dynamic_ps_timer, (unsigned long) local); + timer_setup(&local->dynamic_ps_timer, ieee80211_dynamic_ps_timer, 0); INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 7a76c4a6df30..5e27364e10ac 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -37,9 +37,10 @@ void ieee80211s_stop(void) kmem_cache_destroy(rm_cache); } -static void ieee80211_mesh_housekeeping_timer(unsigned long data) +static void ieee80211_mesh_housekeeping_timer(struct timer_list *t) { - struct ieee80211_sub_if_data *sdata = (void *) data; + struct ieee80211_sub_if_data *sdata = + from_timer(sdata, t, u.mesh.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -528,18 +529,18 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } -static void ieee80211_mesh_path_timer(unsigned long data) +static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mesh.mesh_path_timer); ieee80211_queue_work(&sdata->local->hw, &sdata->work); } -static void ieee80211_mesh_path_root_timer(unsigned long data) +static void ieee80211_mesh_path_root_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mesh.mesh_path_root_timer); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); @@ -1442,9 +1443,8 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; static u8 zero_addr[ETH_ALEN] = {}; - setup_timer(&ifmsh->housekeeping_timer, - ieee80211_mesh_housekeeping_timer, - (unsigned long) sdata); + timer_setup(&ifmsh->housekeeping_timer, + ieee80211_mesh_housekeeping_timer, 0); ifmsh->accepting_plinks = true; atomic_set(&ifmsh->mpaths, 0); @@ -1458,12 +1458,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) mesh_pathtbl_init(sdata); - setup_timer(&ifmsh->mesh_path_timer, - ieee80211_mesh_path_timer, - (unsigned long) sdata); - setup_timer(&ifmsh->mesh_path_root_timer, - ieee80211_mesh_path_root_timer, - (unsigned long) sdata); + timer_setup(&ifmsh->mesh_path_timer, ieee80211_mesh_path_timer, 0); + timer_setup(&ifmsh->mesh_path_root_timer, + ieee80211_mesh_path_root_timer, 0); INIT_LIST_HEAD(&ifmsh->preq_queue.list); skb_queue_head_init(&ifmsh->ps.bc_buf); spin_lock_init(&ifmsh->mesh_preq_queue_lock); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 465b7853edc0..ee56f18cad3f 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -296,7 +296,7 @@ void mesh_path_tx_pending(struct mesh_path *mpath); int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata); void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata); int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr); -void mesh_path_timer(unsigned long data); +void mesh_path_timer(struct timer_list *t); void mesh_path_flush_by_nexthop(struct sta_info *sta); void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 146ec6c0f12f..4f7826d7b47c 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1194,9 +1194,9 @@ endlookup: return err; } -void mesh_path_timer(unsigned long data) +void mesh_path_timer(struct timer_list *t) { - struct mesh_path *mpath = (void *) data; + struct mesh_path *mpath = from_timer(mpath, t, timer); struct ieee80211_sub_if_data *sdata = mpath->sdata; int ret; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 97269caafecd..86c8dfef56a4 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -399,8 +399,7 @@ struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&new_mpath->frame_queue); new_mpath->exp_time = jiffies; spin_lock_init(&new_mpath->state_lock); - setup_timer(&new_mpath->timer, mesh_path_timer, - (unsigned long) new_mpath); + timer_setup(&new_mpath->timer, mesh_path_timer, 0); return new_mpath; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e4ededa1909d..04460440d731 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1066,10 +1066,10 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) } EXPORT_SYMBOL(ieee80211_chswitch_done); -static void ieee80211_chswitch_timer(unsigned long data) +static void ieee80211_chswitch_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.chswitch_timer); ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } @@ -1577,9 +1577,9 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) } } -void ieee80211_dynamic_ps_timer(unsigned long data) +void ieee80211_dynamic_ps_timer(struct timer_list *t) { - struct ieee80211_local *local = (void *) data; + struct ieee80211_local *local = from_timer(local, t, dynamic_ps_timer); ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work); } @@ -3711,10 +3711,10 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, sdata_unlock(sdata); } -static void ieee80211_sta_timer(unsigned long data) +static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.timer); ieee80211_queue_work(&sdata->local->hw, &sdata->work); } @@ -3991,10 +3991,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_sta_bcn_mon_timer(unsigned long data) +static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.bcn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) @@ -4005,10 +4005,10 @@ static void ieee80211_sta_bcn_mon_timer(unsigned long data) &sdata->u.mgd.beacon_connection_loss_work); } -static void ieee80211_sta_conn_mon_timer(unsigned long data) +static void ieee80211_sta_conn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.conn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; @@ -4139,14 +4139,10 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_mgd_work); INIT_DELAYED_WORK(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); - setup_timer(&ifmgd->timer, ieee80211_sta_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, - (unsigned long) sdata); + timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); + timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0); + timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); + timer_setup(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, 0); INIT_DELAYED_WORK(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 88e6ebbbe24f..d351dc1162be 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -150,9 +150,10 @@ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_ocb_housekeeping_timer(unsigned long data) +static void ieee80211_ocb_housekeeping_timer(struct timer_list *t) { - struct ieee80211_sub_if_data *sdata = (void *)data; + struct ieee80211_sub_if_data *sdata = + from_timer(sdata, t, u.ocb.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; @@ -165,9 +166,8 @@ void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; - setup_timer(&ifocb->housekeeping_timer, - ieee80211_ocb_housekeeping_timer, - (unsigned long)sdata); + timer_setup(&ifocb->housekeeping_timer, + ieee80211_ocb_housekeeping_timer, 0); INIT_LIST_HEAD(&ifocb->incomplete_stations); spin_lock_init(&ifocb->incomplete_lock); } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a3060e55122c..0e50065e3433 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1064,9 +1064,9 @@ int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, return ret; } -static void sta_info_cleanup(unsigned long data) +static void sta_info_cleanup(struct timer_list *t) { - struct ieee80211_local *local = (struct ieee80211_local *) data; + struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; @@ -1098,8 +1098,7 @@ int sta_info_init(struct ieee80211_local *local) mutex_init(&local->sta_mtx); INIT_LIST_HEAD(&local->sta_list); - setup_timer(&local->sta_cleanup, sta_info_cleanup, - (unsigned long)local); + timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } -- cgit v1.2.3 From 44905265bc155e0237c76c25bf5ddf740d85a8f2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Oct 2017 21:56:01 +0200 Subject: nl80211: don't expose wdev->ssid for most interfaces For mesh, this is simply wrong - there's no SSID, only the mesh ID, so don't expose it at all. For (P2P) client, it's wrong, because it exposes an internal value that's only used when certain APIs are used. For AP, it's actually the only correct case, so leave that. All other interface types shouldn't be setting this anyway, so there it won't change anything. Fixes: b84e7a05f619 ("nl80211: send the NL80211_ATTR_SSID in nl80211_send_iface()") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a0e1951227fa..b1ac23ca20c8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,10 +2605,32 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag goto nla_put_failure; } - if (wdev->ssid_len) { - if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + if (wdev->ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) goto nla_put_failure; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_ADHOC: { + const u8 *ssid_ie; + if (!wdev->current_bss) + break; + ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, + WLAN_EID_SSID); + if (!ssid_ie) + break; + if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) + goto nla_put_failure; + break; + } + default: + /* nothing */ + break; } + wdev_unlock(wdev); genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From 7cca2acdff2d7c53b4a553756e731693152115d4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 13:25:45 -0700 Subject: mac80211: aggregation: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This removes the tid mapping array and expands the tid structures to add a pointer back to the station, along with the tid index itself. Cc: Johannes Berg Cc: "David S. Miller" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook [switch tid variables to u8, the valid range is 0-15 at most, initialize tid_tx->sta/tid properly] Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 41 +++++++++++++++++------------------------ net/mac80211/agg-tx.c | 44 ++++++++++++++++++-------------------------- net/mac80211/sta_info.c | 8 -------- net/mac80211/sta_info.h | 12 ++++++++++-- 4 files changed, 45 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 88cc1ae935ea..d444752dbf40 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -151,21 +151,17 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session); * After accepting the AddBA Request we activated a timer, * resetting it after each frame that arrives from the originator. */ -static void sta_rx_agg_session_timer_expired(unsigned long data) +static void sta_rx_agg_session_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and various sta_info are needed here, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_rx *tid_rx_timer = + from_timer(tid_rx_timer, t, session_timer); + struct sta_info *sta = tid_rx_timer->sta; + u8 tid = tid_rx_timer->tid; struct tid_ampdu_rx *tid_rx; unsigned long timeout; rcu_read_lock(); - tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[*ptid]); + tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_rx) { rcu_read_unlock(); return; @@ -180,21 +176,18 @@ static void sta_rx_agg_session_timer_expired(unsigned long data) rcu_read_unlock(); ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n", - sta->sta.addr, (u16)*ptid); + sta->sta.addr, tid); - set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired); + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); } -static void sta_rx_agg_reorder_timer_expired(unsigned long data) +static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) { - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, reorder_timer); rcu_read_lock(); - ieee80211_release_reorder_timeout(sta, *ptid); + ieee80211_release_reorder_timeout(tid_rx->sta, tid_rx->tid); rcu_read_unlock(); } @@ -356,14 +349,12 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, spin_lock_init(&tid_agg_rx->reorder_lock); /* rx timer */ - setup_deferrable_timer(&tid_agg_rx->session_timer, - sta_rx_agg_session_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_agg_rx->session_timer, + sta_rx_agg_session_timer_expired, TIMER_DEFERRABLE); /* rx reorder timer */ - setup_timer(&tid_agg_rx->reorder_timer, - sta_rx_agg_reorder_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_agg_rx->reorder_timer, + sta_rx_agg_reorder_timer_expired, 0); /* prepare reordering buffer */ tid_agg_rx->reorder_buf = @@ -399,6 +390,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->auto_seq = auto_seq; tid_agg_rx->started = false; tid_agg_rx->reorder_buf_filtered = 0; + tid_agg_rx->tid = tid; + tid_agg_rx->sta = sta; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index bef516ec47f9..3680b380e70c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -422,15 +422,12 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, * add Block Ack response will arrive from the recipient. * If this timer expires sta_addba_resp_timer_expired will be executed. */ -static void sta_addba_resp_timer_expired(unsigned long data) +static void sta_addba_resp_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and both sta_info and TID are needed, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u16 tid = *(u8 *)data; - struct sta_info *sta = container_of((void *)data, - struct sta_info, timer_to_tid[tid]); + struct tid_ampdu_tx *tid_tx_timer = + from_timer(tid_tx_timer, t, addba_resp_timer); + struct sta_info *sta = tid_tx_timer->sta; + u8 tid = tid_tx_timer->tid; struct tid_ampdu_tx *tid_tx; /* check if the TID waits for addBA response */ @@ -525,21 +522,17 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) * After accepting the AddBA Response we activated a timer, * resetting it after each frame that we send. */ -static void sta_tx_agg_session_timer_expired(unsigned long data) +static void sta_tx_agg_session_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and various sta_info are needed here, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_tx *tid_tx_timer = + from_timer(tid_tx_timer, t, session_timer); + struct sta_info *sta = tid_tx_timer->sta; + u8 tid = tid_tx_timer->tid; struct tid_ampdu_tx *tid_tx; unsigned long timeout; rcu_read_lock(); - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[*ptid]); + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { rcu_read_unlock(); return; @@ -555,9 +548,9 @@ static void sta_tx_agg_session_timer_expired(unsigned long data) rcu_read_unlock(); ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n", - sta->sta.addr, (u16)*ptid); + sta->sta.addr, tid); - ieee80211_stop_tx_ba_session(&sta->sta, *ptid); + ieee80211_stop_tx_ba_session(&sta->sta, tid); } int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, @@ -670,16 +663,15 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, __set_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); tid_tx->timeout = timeout; + tid_tx->sta = sta; + tid_tx->tid = tid; /* response timer */ - setup_timer(&tid_tx->addba_resp_timer, - sta_addba_resp_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_tx->addba_resp_timer, sta_addba_resp_timer_expired, 0); /* tx timer */ - setup_deferrable_timer(&tid_tx->session_timer, - sta_tx_agg_session_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_tx->session_timer, + sta_tx_agg_session_timer_expired, TIMER_DEFERRABLE); /* assign a dialog token */ sta->ampdu_mlme.dialog_token_allocator++; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0e50065e3433..0c5627f8a104 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -379,14 +379,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - /* - * timer_to_tid must be initialized with identity mapping - * to enable session_timer's data differentiation. See - * sta_rx_agg_session_timer_expired for usage. - */ - sta->timer_to_tid[i] = i; - } for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5c54acd10562..cd53619435b6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -126,6 +126,8 @@ enum ieee80211_agg_stop_reason { AGG_STOP_DESTROY_STA, }; +struct sta_info; + /** * struct tid_ampdu_tx - TID aggregation information (Tx). * @@ -133,8 +135,10 @@ enum ieee80211_agg_stop_reason { * @session_timer: check if we keep Tx-ing on the TID (by timeout value) * @addba_resp_timer: timer for peer's response to addba request * @pending: pending frames queue -- use sta's spinlock to protect + * @sta: station we are attached to * @dialog_token: dialog token for aggregation session * @timeout: session timeout value to be filled in ADDBA requests + * @tid: TID number * @state: session state (see above) * @last_tx: jiffies of last tx activity * @stop_initiator: initiator of a session stop @@ -158,6 +162,7 @@ struct tid_ampdu_tx { struct timer_list session_timer; struct timer_list addba_resp_timer; struct sk_buff_head pending; + struct sta_info *sta; unsigned long state; unsigned long last_tx; u16 timeout; @@ -169,6 +174,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; bool amsdu; + u8 tid; }; /** @@ -181,12 +187,14 @@ struct tid_ampdu_tx { * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. + * @sta: station we are attached to * @last_rx: jiffies of last rx activity * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. * @buf_size: buffer size for incoming A-MPDUs * @timeout: reset timer value (in TUs). + * @tid: TID number * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and @@ -208,6 +216,7 @@ struct tid_ampdu_rx { u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; + struct sta_info *sta; struct timer_list session_timer; struct timer_list reorder_timer; unsigned long last_rx; @@ -216,6 +225,7 @@ struct tid_ampdu_rx { u16 ssn; u16 buf_size; u16 timeout; + u8 tid; u8 auto_seq:1, removed:1, started:1; @@ -447,7 +457,6 @@ struct ieee80211_sta_rx_stats { * plus one for non-QoS frames) * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state - * @timer_to_tid: identity mapping to ID timers * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry * @dead: set to true when sta is unlinked @@ -554,7 +563,6 @@ struct sta_info { * Aggregation information, locked with lock. */ struct sta_ampdu_mlme ampdu_mlme; - u8 timer_to_tid[IEEE80211_NUM_TIDS]; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; -- cgit v1.2.3 From d7be102f2945a626f55e0501e52bb31ba3e77b81 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Oct 2017 11:24:27 +0200 Subject: cfg80211: initialize regulatory keys/database later When cfg80211 is built as a module, everything is fine, and we can keep the code as is; in fact, we have to, because there can only be a single module_init(). When cfg80211 is built-in, however, it needs to initialize before drivers (device_initcall/module_init), and thus used to be at subsys_initcall(). I'd moved it to fs_initcall() earlier, where it can remain. However, this is still too early because at that point the key infrastructure hasn't been initialized yet, so X.509 certificates can't be parsed yet. To work around this problem, load the regdb keys only later in a late_initcall(), at which point the necessary infrastructure has been initialized. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Reported-by: Xiaolong Ye Signed-off-by: Johannes Berg --- net/wireless/reg.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3871998059de..78e71b0390be 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3644,27 +3644,14 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, } } -int __init regulatory_init(void) +static int __init regulatory_init_db(void) { - int err = 0; + int err; err = load_builtin_regdb_keys(); if (err) return err; - reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); - if (IS_ERR(reg_pdev)) - return PTR_ERR(reg_pdev); - - spin_lock_init(®_requests_lock); - spin_lock_init(®_pending_beacons_lock); - spin_lock_init(®_indoor_lock); - - rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); - - user_alpha2[0] = '9'; - user_alpha2[1] = '7'; - /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { @@ -3692,6 +3679,31 @@ int __init regulatory_init(void) return 0; } +#ifndef MODULE +late_initcall(regulatory_init_db); +#endif + +int __init regulatory_init(void) +{ + reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); + if (IS_ERR(reg_pdev)) + return PTR_ERR(reg_pdev); + + spin_lock_init(®_requests_lock); + spin_lock_init(®_pending_beacons_lock); + spin_lock_init(®_indoor_lock); + + rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); + + user_alpha2[0] = '9'; + user_alpha2[1] = '7'; + +#ifdef MODULE + return regulatory_init_db(); +#else + return 0; +#endif +} void regulatory_exit(void) { -- cgit v1.2.3 From 33ddd81e2bd5d9970b9f01ab383ba45035fa41ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Nov 2017 11:33:55 +0100 Subject: mac80211: properly free requested-but-not-started TX agg sessions When deleting a station or otherwise tearing down all aggregation sessions, make sure to delete requested but not yet started ones, to avoid the following scenario: * session is requested, added to tid_start_tx[] * ieee80211_ba_session_work() runs, gets past BLOCK_BA check * ieee80211_sta_tear_down_BA_sessions() runs, locks &sta->ampdu_mlme.mtx, e.g. while deleting the station - deleting all active sessions * ieee80211_ba_session_work() continues since tear down flushes it, and calls ieee80211_tx_ba_session_handle_start() for the new session, arms the timer for it * station deletion continues to __cleanup_single_sta() and frees the session struct, while the timer is armed Reported-by: Fengguang Wu Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 3680b380e70c..5f8ab5be369f 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -330,6 +330,11 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, spin_lock_bh(&sta->lock); + /* free struct pending for start, if present */ + tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; + kfree(tid_tx); + sta->ampdu_mlme.tid_start_tx[tid] = NULL; + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); if (!tid_tx) { spin_unlock_bh(&sta->lock); -- cgit v1.2.3 From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:54 -0800 Subject: bpf: offload: move offload device validation out to the drivers With TC shared block changes we can't depend on correct netdev pointer being available in cls_bpf. Move the device validation to the driver. Core will only make sure that offloaded programs are always attached in the driver (or in HW by the driver). We trust that drivers which implement offload callbacks will perform necessary checks. Moving the checks to the driver is generally a useful thing, in practice the check should be against a switchdev instance, not a netdev, given that most ASICs will probably allow using the same program on many ports. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: Daniel Borkmann --- net/core/dev.c | 7 ++----- net/sched/cls_bpf.c | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..09525a27319c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7139,11 +7139,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - if (bpf_op == ops->ndo_bpf) - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - dev); - else - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fb680dafac5a..a9f3e317055c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, { struct bpf_prog *fp; char *name = NULL; + bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; - if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) - fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, - qdisc_dev(tp->q)); - else - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); -- cgit v1.2.3 From 441a33031fe5a3e828fbc17a4f9a5bab9143243c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:55 -0800 Subject: net: xdp: don't allow device-bound programs in driver mode Currently device-bound programs are not able to run on the host to save resources (host JIT is not invoked). Don't allow XDP programs to be attached without the HW_MODE flag. In theory if program is already translated for device offload the driver should choose to offload it instead of loading it in the driver. However, offloading translated program may still fail resulting in device-bound program being run on the host. Prevent this by refusing to attach device bound programs if XDP_FLAGS_HW_MODE is not set. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 09525a27319c..5e2ba133fba7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7143,6 +7143,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); + + if (!(flags & XDP_FLAGS_HW_MODE) && + bpf_prog_is_dev_bound(prog->aux)) { + NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); + bpf_prog_put(prog); + return -EINVAL; + } } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); -- cgit v1.2.3 From e0e853ac036f76fcad3995554b8b6cef555b010f Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 20 Nov 2017 21:43:03 +0100 Subject: tipc: fix access of released memory When the function tipc_group_filter_msg() finds that a member event indicates that the member is leaving the group, it first deletes the member instance, and then purges the message queue being handled by the call. But the message queue is an aggregated field in the just deleted item, leading the purge call to access freed memory. We fix this by swapping the order of the two actions. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 7821085a7dd8..12777cac638a 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -539,8 +539,8 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { - tipc_group_delete_member(grp, m); __skb_queue_purge(defq); + tipc_group_delete_member(grp, m); break; } if (!update) -- cgit v1.2.3 From 1e9aa74ecd84ac3eeffb7a9f08f9021d7df6486a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Aug 2017 12:41:43 -0700 Subject: net/atm/mpc: Avoid open-coded assignment of timer callback function Instead of a single function assignment, just fold this into DEFINE_TIMER(). Cc: "David S. Miller" Signed-off-by: Kees Cook --- net/atm/mpc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e882d8b5db05..7c6a1cc760a2 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -121,7 +121,7 @@ static struct notifier_block mpoa_notifier = { struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static DEFINE_TIMER(mpc_timer, NULL); +static DEFINE_TIMER(mpc_timer, mpc_cache_check); static struct mpoa_client *find_mpc_by_itfnum(int itf) @@ -1413,7 +1413,6 @@ static void mpc_timer_refresh(void) { mpc_timer.expires = jiffies + (MPC_P2 * HZ); checking_time = mpc_timer.expires; - mpc_timer.function = (TIMER_FUNC_TYPE)mpc_cache_check; add_timer(&mpc_timer); } -- cgit v1.2.3 From 24ed960abf1d50cb7834e99a0cfc081bc0656712 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Aug 2017 11:28:21 -0700 Subject: treewide: Switch DEFINE_TIMER callbacks to struct timer_list * This changes all DEFINE_TIMER() callbacks to use a struct timer_list pointer instead of unsigned long. Since the data argument has already been removed, none of these callbacks are using their argument currently, so this renames the argument to "unused". Done using the following semantic patch: @match_define_timer@ declarer name DEFINE_TIMER; identifier _timer, _callback; @@ DEFINE_TIMER(_timer, _callback); @change_callback depends on match_define_timer@ identifier match_define_timer._callback; type _origtype; identifier _origarg; @@ void -_callback(_origtype _origarg) +_callback(struct timer_list *unused) { ... } Signed-off-by: Kees Cook --- net/decnet/dn_route.c | 4 ++-- net/ipv6/ip6_flowlabel.c | 4 ++-- net/netrom/nr_loopback.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index b36dceab0dc1..de4a0cafb19f 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -125,7 +125,7 @@ static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static int dn_route_input(struct sk_buff *); -static void dn_run_flush(unsigned long dummy); +static void dn_run_flush(struct timer_list *unused); static struct dn_rt_hash_bucket *dn_rt_hash_table; static unsigned int dn_rt_hash_mask; @@ -357,7 +357,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou return 0; } -static void dn_run_flush(unsigned long dummy) +static void dn_run_flush(struct timer_list *unused) { int i; struct dn_route *rt, *next; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 9f2e73c71768..7f59c8fabeeb 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -46,7 +46,7 @@ static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; -static void ip6_fl_gc(unsigned long dummy); +static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ @@ -127,7 +127,7 @@ static void fl_release(struct ip6_flowlabel *fl) spin_unlock_bh(&ip6_fl_lock); } -static void ip6_fl_gc(unsigned long dummy) +static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 989ae647825e..215ad22a9647 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -15,7 +15,7 @@ #include #include -static void nr_loopback_timer(unsigned long); +static void nr_loopback_timer(struct timer_list *); static struct sk_buff_head loopback_queue; static DEFINE_TIMER(loopback_timer, nr_loopback_timer); @@ -48,7 +48,7 @@ int nr_loopback_queue(struct sk_buff *skb) return 1; } -static void nr_loopback_timer(unsigned long param) +static void nr_loopback_timer(struct timer_list *unused) { struct sk_buff *skb; ax25_address *nr_dest; -- cgit v1.2.3 From e99e88a9d2b067465adaa9c111ada99a041bef9a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 14:43:17 -0700 Subject: treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . \ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. @unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. @change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook --- net/802/garp.c | 6 +++--- net/802/mrp.c | 13 ++++++------- net/appletalk/aarp.c | 4 ++-- net/appletalk/ddp.c | 7 +++---- net/batman-adv/tp_meter.c | 14 ++++++-------- net/bluetooth/hidp/core.c | 7 +++---- net/bluetooth/rfcomm/core.c | 12 ++++++------ net/bluetooth/sco.c | 6 +++--- net/core/drop_monitor.c | 7 +++---- net/core/gen_estimator.c | 6 +++--- net/core/neighbour.c | 14 +++++++------- net/decnet/dn_route.c | 4 ++-- net/decnet/dn_timer.c | 8 ++++---- net/ipv4/igmp.c | 20 +++++++++----------- net/ipv4/ipmr.c | 9 ++++----- net/ipv6/addrconf.c | 9 ++++----- net/ipv6/ip6mr.c | 9 ++++----- net/ipv6/mcast.c | 33 +++++++++++++++------------------ net/ncsi/ncsi-manage.c | 8 +++----- net/netfilter/nf_conntrack_expect.c | 7 +++---- net/netfilter/nfnetlink_log.c | 8 ++++---- net/netfilter/xt_IDLETIMER.c | 7 +++---- net/netfilter/xt_LED.c | 8 ++++---- net/nfc/nci/core.c | 14 ++++++-------- net/rxrpc/call_object.c | 7 +++---- net/wireless/lib80211.c | 11 ++++++----- net/x25/x25_link.c | 8 ++++---- net/xfrm/xfrm_state.c | 9 ++++----- 28 files changed, 127 insertions(+), 148 deletions(-) (limited to 'net') diff --git a/net/802/garp.c b/net/802/garp.c index 2dac647ff420..7f50d47470bd 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -401,9 +401,9 @@ static void garp_join_timer_arm(struct garp_applicant *app) mod_timer(&app->join_timer, jiffies + delay); } -static void garp_join_timer(unsigned long data) +static void garp_join_timer(struct timer_list *t) { - struct garp_applicant *app = (struct garp_applicant *)data; + struct garp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); @@ -584,7 +584,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl) spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->garp_port->applicants[appl->type], app); - setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app); + timer_setup(&app->join_timer, garp_join_timer, 0); garp_join_timer_arm(app); return 0; diff --git a/net/802/mrp.c b/net/802/mrp.c index be4dd3165347..a808dd5bbb27 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -586,9 +586,9 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) mod_timer(&app->join_timer, jiffies + delay); } -static void mrp_join_timer(unsigned long data) +static void mrp_join_timer(struct timer_list *t) { - struct mrp_applicant *app = (struct mrp_applicant *)data; + struct mrp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); @@ -605,9 +605,9 @@ static void mrp_periodic_timer_arm(struct mrp_applicant *app) jiffies + msecs_to_jiffies(mrp_periodic_time)); } -static void mrp_periodic_timer(unsigned long data) +static void mrp_periodic_timer(struct timer_list *t) { - struct mrp_applicant *app = (struct mrp_applicant *)data; + struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_PERIODIC); @@ -865,10 +865,9 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); - setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app); + timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); - setup_timer(&app->periodic_timer, mrp_periodic_timer, - (unsigned long)app); + timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 8ad3ec2610b6..309d7dbb36e8 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -310,7 +310,7 @@ static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev) } /* Handle the timer event */ -static void aarp_expire_timeout(unsigned long unused) +static void aarp_expire_timeout(struct timer_list *unused) { int ct; @@ -884,7 +884,7 @@ void __init aarp_proto_init(void) aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv); if (!aarp_dl) printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); - setup_timer(&aarp_timer, aarp_expire_timeout, 0); + timer_setup(&aarp_timer, aarp_expire_timeout, 0); aarp_timer.expires = jiffies + sysctl_aarp_expiry_time; add_timer(&aarp_timer); register_netdevice_notifier(&aarp_notifier); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 5d035c1f1156..03a9fc0771c0 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -158,9 +158,9 @@ found: return s; } -static void atalk_destroy_timer(unsigned long data) +static void atalk_destroy_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); if (sk_has_allocations(sk)) { sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; @@ -175,8 +175,7 @@ static inline void atalk_destroy_socket(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); if (sk_has_allocations(sk)) { - setup_timer(&sk->sk_timer, atalk_destroy_timer, - (unsigned long)sk); + timer_setup(&sk->sk_timer, atalk_destroy_timer, 0); sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; add_timer(&sk->sk_timer); } else diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 4b90033f35a8..15cd2139381e 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -488,9 +488,9 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) * Switch to Slow Start, set the ss_threshold to half of the current cwnd and * reset the cwnd to 3*MSS */ -static void batadv_tp_sender_timeout(unsigned long arg) +static void batadv_tp_sender_timeout(struct timer_list *t) { - struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer); struct batadv_priv *bat_priv = tp_vars->bat_priv; if (atomic_read(&tp_vars->sending) == 0) @@ -1020,8 +1020,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, atomic64_set(&tp_vars->tot_sent, 0); kref_get(&tp_vars->refcount); - setup_timer(&tp_vars->timer, batadv_tp_sender_timeout, - (unsigned long)tp_vars); + timer_setup(&tp_vars->timer, batadv_tp_sender_timeout, 0); tp_vars->bat_priv = bat_priv; tp_vars->start_time = jiffies; @@ -1109,9 +1108,9 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) * reached without received ack * @arg: address of the related tp_vars */ -static void batadv_tp_receiver_shutdown(unsigned long arg) +static void batadv_tp_receiver_shutdown(struct timer_list *t) { - struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer); struct batadv_tp_unacked *un, *safe; struct batadv_priv *bat_priv; @@ -1373,8 +1372,7 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); kref_get(&tp_vars->refcount); - setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown, - (unsigned long)tp_vars); + timer_setup(&tp_vars->timer, batadv_tp_receiver_shutdown, 0); batadv_tp_reset_receiver_timer(tp_vars); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 8112893037bd..f2cec70d520c 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -398,9 +398,9 @@ static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum, } } -static void hidp_idle_timeout(unsigned long arg) +static void hidp_idle_timeout(struct timer_list *t) { - struct hidp_session *session = (struct hidp_session *) arg; + struct hidp_session *session = from_timer(session, t, timer); /* The HIDP user-space API only contains calls to add and remove * devices. There is no way to forward events of any kind. Therefore, @@ -944,8 +944,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, /* device management */ INIT_WORK(&session->dev_init, hidp_session_dev_work); - setup_timer(&session->timer, hidp_idle_timeout, - (unsigned long)session); + timer_setup(&session->timer, hidp_idle_timeout, 0); /* session data */ mutex_init(&session->report_mutex); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4a0b41d75c84..b98225d65e87 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -233,9 +233,9 @@ static int rfcomm_check_security(struct rfcomm_dlc *d) d->out); } -static void rfcomm_session_timeout(unsigned long arg) +static void rfcomm_session_timeout(struct timer_list *t) { - struct rfcomm_session *s = (void *) arg; + struct rfcomm_session *s = from_timer(s, t, timer); BT_DBG("session %p state %ld", s, s->state); @@ -258,9 +258,9 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s) } /* ---- RFCOMM DLCs ---- */ -static void rfcomm_dlc_timeout(unsigned long arg) +static void rfcomm_dlc_timeout(struct timer_list *t) { - struct rfcomm_dlc *d = (void *) arg; + struct rfcomm_dlc *d = from_timer(d, t, timer); BT_DBG("dlc %p state %ld", d, d->state); @@ -307,7 +307,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) if (!d) return NULL; - setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d); + timer_setup(&d->timer, rfcomm_dlc_timeout, 0); skb_queue_head_init(&d->tx_queue); mutex_init(&d->lock); @@ -650,7 +650,7 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) BT_DBG("session %p sock %p", s, sock); - setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s); + timer_setup(&s->timer, rfcomm_session_timeout, 0); INIT_LIST_HEAD(&s->dlcs); s->state = state; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 795e920a3281..08df57665e1f 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -73,9 +73,9 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) -static void sco_sock_timeout(unsigned long arg) +static void sco_sock_timeout(struct timer_list *t) { - struct sock *sk = (struct sock *)arg; + struct sock *sk = from_timer(sk, t, sk_timer); BT_DBG("sock %p state %d", sk, sk->sk_state); @@ -487,7 +487,7 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; - setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk); + timer_setup(&sk->sk_timer, sco_sock_timeout, 0); bt_sock_link(&sco_sk_list, sk); return sk; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 70ccda233bd1..c7785efeea57 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -144,9 +144,9 @@ static void send_dm_alert(struct work_struct *work) * in the event that more drops will arrive during the * hysteresis period. */ -static void sched_send_work(unsigned long _data) +static void sched_send_work(struct timer_list *t) { - struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data; + struct per_cpu_dm_data *data = from_timer(data, t, send_timer); schedule_work(&data->dm_alert_work); } @@ -412,8 +412,7 @@ static int __init init_net_drop_monitor(void) for_each_possible_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); INIT_WORK(&data->dm_alert_work, send_dm_alert); - setup_timer(&data->send_timer, sched_send_work, - (unsigned long)data); + timer_setup(&data->send_timer, sched_send_work, 0); spin_lock_init(&data->lock); reset_per_cpu_data(data); } diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 7c1ffd6f9501..9834cfa21b21 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -76,9 +76,9 @@ static void est_fetch_counters(struct net_rate_estimator *e, } -static void est_timer(unsigned long arg) +static void est_timer(struct timer_list *t) { - struct net_rate_estimator *est = (struct net_rate_estimator *)arg; + struct net_rate_estimator *est = from_timer(est, t, timer); struct gnet_stats_basic_packed b; u64 rate, brate; @@ -170,7 +170,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, } est->next_jiffies = jiffies + ((HZ/4) << intvl_log); - setup_timer(&est->timer, est_timer, (unsigned long)est); + timer_setup(&est->timer, est_timer, 0); mod_timer(&est->timer, est->next_jiffies); rcu_assign_pointer(*rate_est, est); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6ea3a1a7f36a..d1f5fe986edd 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -51,7 +51,7 @@ do { \ #define PNEIGH_HASHMASK 0xF -static void neigh_timer_handler(unsigned long arg); +static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); @@ -331,7 +331,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->output = neigh_blackhole; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); - setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); + timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; @@ -903,10 +903,10 @@ static void neigh_probe(struct neighbour *neigh) /* Called when a timer expires for a neighbour entry. */ -static void neigh_timer_handler(unsigned long arg) +static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; - struct neighbour *neigh = (struct neighbour *)arg; + struct neighbour *neigh = from_timer(neigh, t, timer); unsigned int state; int notify = 0; @@ -1391,9 +1391,9 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) } EXPORT_SYMBOL(neigh_direct_output); -static void neigh_proxy_process(unsigned long arg) +static void neigh_proxy_process(struct timer_list *t) { - struct neigh_table *tbl = (struct neigh_table *)arg; + struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; @@ -1573,7 +1573,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); - setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index de4a0cafb19f..324cb9f2f551 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -183,7 +183,7 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static void dn_dst_check_expire(unsigned long dummy) +static void dn_dst_check_expire(struct timer_list *unused) { int i; struct dn_route *rt; @@ -1875,7 +1875,7 @@ void __init dn_route_init(void) kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dst_entries_init(&dn_dst_ops); - setup_timer(&dn_route_timer, dn_dst_check_expire, 0); + timer_setup(&dn_route_timer, dn_dst_check_expire, 0); dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c index f430daed24a0..aa4155875ca8 100644 --- a/net/decnet/dn_timer.c +++ b/net/decnet/dn_timer.c @@ -34,11 +34,11 @@ #define SLOW_INTERVAL (HZ/2) -static void dn_slow_timer(unsigned long arg); +static void dn_slow_timer(struct timer_list *t); void dn_start_slow_timer(struct sock *sk) { - setup_timer(&sk->sk_timer, dn_slow_timer, (unsigned long)sk); + timer_setup(&sk->sk_timer, dn_slow_timer, 0); sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); } @@ -47,9 +47,9 @@ void dn_stop_slow_timer(struct sock *sk) sk_stop_timer(sk, &sk->sk_timer); } -static void dn_slow_timer(unsigned long arg) +static void dn_slow_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)arg; + struct sock *sk = from_timer(sk, t, sk_timer); struct dn_scp *scp = DN_SK(sk); bh_lock_sock(sk); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ab183af0b5b6..d1f8f302dbf3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -752,18 +752,18 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return ip_local_out(net, skb->sk, skb); } -static void igmp_gq_timer_expire(unsigned long data) +static void igmp_gq_timer_expire(struct timer_list *t) { - struct in_device *in_dev = (struct in_device *)data; + struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } -static void igmp_ifc_timer_expire(unsigned long data) +static void igmp_ifc_timer_expire(struct timer_list *t) { - struct in_device *in_dev = (struct in_device *)data; + struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); igmpv3_send_cr(in_dev); if (in_dev->mr_ifc_count) { @@ -784,9 +784,9 @@ static void igmp_ifc_event(struct in_device *in_dev) } -static void igmp_timer_expire(unsigned long data) +static void igmp_timer_expire(struct timer_list *t) { - struct ip_mc_list *im = (struct ip_mc_list *)data; + struct ip_mc_list *im = from_timer(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -1385,7 +1385,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST - setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); + timer_setup(&im->timer, igmp_timer_expire, 0); im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif @@ -1695,10 +1695,8 @@ void ip_mc_init_dev(struct in_device *in_dev) ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, - (unsigned long)in_dev); - setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, - (unsigned long)in_dev); + timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); + timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 40a43ad294cb..fd5f19c988e4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -112,7 +112,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, bool all); -static void ipmr_expire_process(unsigned long arg); +static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ @@ -375,8 +375,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); - setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, - (unsigned long)mrt); + timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); mrt->mroute_reg_vif_num = -1; #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -804,9 +803,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) } /* Timer process for the unresolved queue. */ -static void ipmr_expire_process(unsigned long arg) +static void ipmr_expire_process(struct timer_list *t) { - struct mr_table *mrt = (struct mr_table *)arg; + struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); unsigned long now; unsigned long expires; struct mfc_cache *c, *next; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a0ae1c9d37df..f49bd7897e95 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -188,7 +188,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id); static void addrconf_dad_run(struct inet6_dev *idev); -static void addrconf_rs_timer(unsigned long data); +static void addrconf_rs_timer(struct timer_list *t); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -388,8 +388,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) rwlock_init(&ndev->lock); ndev->dev = dev; INIT_LIST_HEAD(&ndev->addr_list); - setup_timer(&ndev->rs_timer, addrconf_rs_timer, - (unsigned long)ndev); + timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0); memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); if (ndev->cnf.stable_secret.initialized) @@ -3741,9 +3740,9 @@ restart: return 0; } -static void addrconf_rs_timer(unsigned long data) +static void addrconf_rs_timer(struct timer_list *t) { - struct inet6_dev *idev = (struct inet6_dev *)data; + struct inet6_dev *idev = from_timer(idev, t, rs_timer); struct net_device *dev = idev->dev; struct in6_addr lladdr; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9c24b85949c1..a2e1a864eb46 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -120,7 +120,7 @@ static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr6_table *mrt, bool all); -static void ipmr_expire_process(unsigned long arg); +static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ @@ -320,8 +320,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) INIT_LIST_HEAD(&mrt->mfc6_unres_queue); - setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, - (unsigned long)mrt); + timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); #ifdef CONFIG_IPV6_PIMSM_V2 mrt->mroute_reg_vif_num = -1; @@ -888,9 +887,9 @@ static void ipmr_do_expire_process(struct mr6_table *mrt) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } -static void ipmr_expire_process(unsigned long arg) +static void ipmr_expire_process(struct timer_list *t) { - struct mr6_table *mrt = (struct mr6_table *)arg; + struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 12b7c27ce5ce..fc6d7d143f2c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -75,10 +75,10 @@ static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); -static void igmp6_timer_handler(unsigned long data); +static void igmp6_timer_handler(struct timer_list *t); -static void mld_gq_timer_expire(unsigned long data); -static void mld_ifc_timer_expire(unsigned long data); +static void mld_gq_timer_expire(struct timer_list *t); +static void mld_ifc_timer_expire(struct timer_list *t); static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); @@ -839,7 +839,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, if (!mc) return NULL; - setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); + timer_setup(&mc->mca_timer, igmp6_timer_handler, 0); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ @@ -2083,9 +2083,9 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev) } } -static void mld_dad_timer_expire(unsigned long data) +static void mld_dad_timer_expire(struct timer_list *t) { - struct inet6_dev *idev = (struct inet6_dev *)data; + struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer); mld_send_initial_cr(idev); if (idev->mc_dad_count) { @@ -2432,18 +2432,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma) } } -static void mld_gq_timer_expire(unsigned long data) +static void mld_gq_timer_expire(struct timer_list *t) { - struct inet6_dev *idev = (struct inet6_dev *)data; + struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer); idev->mc_gq_running = 0; mld_send_report(idev, NULL); in6_dev_put(idev); } -static void mld_ifc_timer_expire(unsigned long data) +static void mld_ifc_timer_expire(struct timer_list *t) { - struct inet6_dev *idev = (struct inet6_dev *)data; + struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2462,9 +2462,9 @@ static void mld_ifc_event(struct inet6_dev *idev) mld_ifc_start_timer(idev, 1); } -static void igmp6_timer_handler(unsigned long data) +static void igmp6_timer_handler(struct timer_list *t) { - struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); @@ -2552,14 +2552,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) write_lock_bh(&idev->lock); spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; - setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire, - (unsigned long)idev); + timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0); idev->mc_tomb = NULL; idev->mc_ifc_count = 0; - setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire, - (unsigned long)idev); - setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, - (unsigned long)idev); + timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0); + timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0); ipv6_mc_reset(idev); write_unlock_bh(&idev->lock); } diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index a2b904a718c6..3ccf8dfb233e 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -529,9 +529,9 @@ struct ncsi_dev *ncsi_find_dev(struct net_device *dev) return NULL; } -static void ncsi_request_timeout(unsigned long data) +static void ncsi_request_timeout(struct timer_list *t) { - struct ncsi_request *nr = (struct ncsi_request *)data; + struct ncsi_request *nr = from_timer(nr, t, timer); struct ncsi_dev_priv *ndp = nr->ndp; unsigned long flags; @@ -1577,9 +1577,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { ndp->requests[i].id = i; ndp->requests[i].ndp = ndp; - setup_timer(&ndp->requests[i].timer, - ncsi_request_timeout, - (unsigned long)&ndp->requests[i]); + timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0); } spin_lock_irqsave(&ncsi_dev_lock, flags); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 64778f9a8548..d6748a8a79c5 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -67,9 +67,9 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); -static void nf_ct_expectation_timed_out(unsigned long ul_expect) +static void nf_ct_expectation_timed_out(struct timer_list *t) { - struct nf_conntrack_expect *exp = (void *)ul_expect; + struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); @@ -368,8 +368,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); - setup_timer(&exp->timeout, nf_ct_expectation_timed_out, - (unsigned long)exp); + timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cad6498f10b0..e5afab86381c 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -151,7 +151,7 @@ instance_put(struct nfulnl_instance *inst) call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); } -static void nfulnl_timer(unsigned long data); +static void nfulnl_timer(struct timer_list *t); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, @@ -184,7 +184,7 @@ instance_create(struct net *net, u_int16_t group_num, /* needs to be two, since we _put() after creation */ refcount_set(&inst->use, 2); - setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); + timer_setup(&inst->timer, nfulnl_timer, 0); inst->net = get_net(net); inst->peer_user_ns = user_ns; @@ -377,9 +377,9 @@ __nfulnl_flush(struct nfulnl_instance *inst) } static void -nfulnl_timer(unsigned long data) +nfulnl_timer(struct timer_list *t) { - struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + struct nfulnl_instance *inst = from_timer(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index daf45da448fa..ee3421ad108d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -107,9 +107,9 @@ static void idletimer_tg_work(struct work_struct *work) sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } -static void idletimer_tg_expired(unsigned long data) +static void idletimer_tg_expired(struct timer_list *t) { - struct idletimer_tg *timer = (struct idletimer_tg *) data; + struct idletimer_tg *timer = from_timer(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); @@ -143,8 +143,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) list_add(&info->timer->entry, &idletimer_tg_list); - setup_timer(&info->timer->timer, idletimer_tg_expired, - (unsigned long) info->timer); + timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; mod_timer(&info->timer->timer, diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 3ba31c194cce..0971634e5444 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -85,9 +85,10 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static void led_timeout_callback(unsigned long data) +static void led_timeout_callback(struct timer_list *t) { - struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data; + struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t, + timer); led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); } @@ -143,8 +144,7 @@ static int led_tg_check(const struct xt_tgchk_param *par) /* See if we need to set up a timer */ if (ledinfo->delay > 0) - setup_timer(&ledinternal->timer, led_timeout_callback, - (unsigned long)ledinternal); + timer_setup(&ledinternal->timer, led_timeout_callback, 0); list_add_tail(&ledinternal->list, &xt_led_triggers); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index c25e9b4179c3..074960154993 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -591,18 +591,18 @@ static int nci_close_device(struct nci_dev *ndev) } /* NCI command timer function */ -static void nci_cmd_timer(unsigned long arg) +static void nci_cmd_timer(struct timer_list *t) { - struct nci_dev *ndev = (void *) arg; + struct nci_dev *ndev = from_timer(ndev, t, cmd_timer); atomic_set(&ndev->cmd_cnt, 1); queue_work(ndev->cmd_wq, &ndev->cmd_work); } /* NCI data exchange timer function */ -static void nci_data_timer(unsigned long arg) +static void nci_data_timer(struct timer_list *t) { - struct nci_dev *ndev = (void *) arg; + struct nci_dev *ndev = from_timer(ndev, t, data_timer); set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); queue_work(ndev->rx_wq, &ndev->rx_work); @@ -1232,10 +1232,8 @@ int nci_register_device(struct nci_dev *ndev) skb_queue_head_init(&ndev->rx_q); skb_queue_head_init(&ndev->tx_q); - setup_timer(&ndev->cmd_timer, nci_cmd_timer, - (unsigned long) ndev); - setup_timer(&ndev->data_timer, nci_data_timer, - (unsigned long) ndev); + timer_setup(&ndev->cmd_timer, nci_cmd_timer, 0); + timer_setup(&ndev->data_timer, nci_data_timer, 0); mutex_init(&ndev->req_lock); INIT_LIST_HEAD(&ndev->conn_info_list); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4c7fbc6dcce7..994dc2df57e4 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -45,9 +45,9 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { struct kmem_cache *rxrpc_call_jar; -static void rxrpc_call_timer_expired(unsigned long _call) +static void rxrpc_call_timer_expired(struct timer_list *t) { - struct rxrpc_call *call = (struct rxrpc_call *)_call; + struct rxrpc_call *call = from_timer(call, t, timer); _enter("%d", call->debug_id); @@ -114,8 +114,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) goto nomem_2; mutex_init(&call->user_mutex); - setup_timer(&call->timer, rxrpc_call_timer_expired, - (unsigned long)call); + timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c index 459611577d3d..801d4781a73b 100644 --- a/net/wireless/lib80211.c +++ b/net/wireless/lib80211.c @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(lib80211_crypto_lock); static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, int force); static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info); -static void lib80211_crypt_deinit_handler(unsigned long data); +static void lib80211_crypt_deinit_handler(struct timer_list *t); int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, spinlock_t *lock) @@ -55,8 +55,8 @@ int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, info->lock = lock; INIT_LIST_HEAD(&info->crypt_deinit_list); - setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler, - (unsigned long)info); + timer_setup(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler, + 0); return 0; } @@ -116,9 +116,10 @@ static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info) spin_unlock_irqrestore(info->lock, flags); } -static void lib80211_crypt_deinit_handler(unsigned long data) +static void lib80211_crypt_deinit_handler(struct timer_list *t) { - struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data; + struct lib80211_crypt_info *info = from_timer(info, t, + crypt_deinit_timer); unsigned long flags; lib80211_crypt_deinit_entries(info, 0); diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index e0cd04d28352..a6a8ab09b914 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -36,7 +36,7 @@ LIST_HEAD(x25_neigh_list); DEFINE_RWLOCK(x25_neigh_list_lock); -static void x25_t20timer_expiry(unsigned long); +static void x25_t20timer_expiry(struct timer_list *); static void x25_transmit_restart_confirmation(struct x25_neigh *nb); static void x25_transmit_restart_request(struct x25_neigh *nb); @@ -49,9 +49,9 @@ static inline void x25_start_t20timer(struct x25_neigh *nb) mod_timer(&nb->t20timer, jiffies + nb->t20); } -static void x25_t20timer_expiry(unsigned long param) +static void x25_t20timer_expiry(struct timer_list *t) { - struct x25_neigh *nb = (struct x25_neigh *)param; + struct x25_neigh *nb = from_timer(nb, t, t20timer); x25_transmit_restart_request(nb); @@ -252,7 +252,7 @@ void x25_link_device_up(struct net_device *dev) return; skb_queue_head_init(&nb->queue); - setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb); + timer_setup(&nb->t20timer, x25_t20timer_expiry, 0); dev_hold(dev); nb->dev = dev; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1f5cee2269af..065d89606888 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -556,7 +556,7 @@ out: return HRTIMER_NORESTART; } -static void xfrm_replay_timer_handler(unsigned long data); +static void xfrm_replay_timer_handler(struct timer_list *t); struct xfrm_state *xfrm_state_alloc(struct net *net) { @@ -574,8 +574,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->byspi); tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, HRTIMER_MODE_ABS); - setup_timer(&x->rtimer, xfrm_replay_timer_handler, - (unsigned long)x); + timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = get_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; @@ -1879,9 +1878,9 @@ void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net) } EXPORT_SYMBOL(xfrm_state_walk_done); -static void xfrm_replay_timer_handler(unsigned long data) +static void xfrm_replay_timer_handler(struct timer_list *t) { - struct xfrm_state *x = (struct xfrm_state *)data; + struct xfrm_state *x = from_timer(x, t, rtimer); spin_lock(&x->lock); -- cgit v1.2.3 From 86cb30ec07cdc78ad94d94bb3756c7c2d46968b9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 20:21:24 -0700 Subject: treewide: setup_timer() -> timer_setup() (2 field) This converts all remaining setup_timer() calls that use a nested field to reach a struct timer_list. Coccinelle does not have an easy way to match multiple fields, so a new script is needed to change the matches of "&_E->_timer" into "&_E->_field1._timer" in all the rules. spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . \ --cocci-file ~/src/data/timer_setup-2fields.cocci @fix_address_of depends@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _field1; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_field1._timer, NULL, _E); +timer_setup(&_E->_field1._timer, NULL, 0); | -setup_timer(&_E->_field1._timer, NULL, (_cast_data)_E); +timer_setup(&_E->_field1._timer, NULL, 0); | -setup_timer(&_E._field1._timer, NULL, &_E); +timer_setup(&_E._field1._timer, NULL, 0); | -setup_timer(&_E._field1._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._field1._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _field1; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_field1._timer, _callback, _E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, &_callback, _E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, (_cast_func)_callback, _E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, _callback, (_cast_data)_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._field1._timer, _callback, 0); | _E->_field1._timer@_stl.function = _callback; | _E->_field1._timer@_stl.function = &_callback; | _E->_field1._timer@_stl.function = (_cast_func)_callback; | _E->_field1._timer@_stl.function = (_cast_func)&_callback; | _E._field1._timer@_stl.function = _callback; | _E._field1._timer@_stl.function = &_callback; | _E._field1._timer@_stl.function = (_cast_func)_callback; | _E._field1._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _field1._timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _field1._timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _field1._timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _field1._timer); ... when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _field1._timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _field1._timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _field1._timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. @unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_field1._timer, _callback, 0); +setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E); | -timer_setup(&_E._field1._timer, _callback, 0); +setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_field1._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_field1._timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_field1._timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_field1._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._field1._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._field1._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._field1._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._field1._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. @change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._field1; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_field1._timer | -(_cast_data)&_E +&_E._field1._timer | -_E +&_E->_field1._timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _field1; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_field1._timer, _callback, 0); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, _callback, 0L); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E->_field1._timer, _callback, 0UL); +timer_setup(&_E->_field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, _callback, 0); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, _callback, 0L); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_E._field1._timer, _callback, 0UL); +timer_setup(&_E._field1._timer, _callback, 0); | -setup_timer(&_field1._timer, _callback, 0); +timer_setup(&_field1._timer, _callback, 0); | -setup_timer(&_field1._timer, _callback, 0L); +timer_setup(&_field1._timer, _callback, 0); | -setup_timer(&_field1._timer, _callback, 0UL); +timer_setup(&_field1._timer, _callback, 0); | -setup_timer(_field1._timer, _callback, 0); +timer_setup(_field1._timer, _callback, 0); | -setup_timer(_field1._timer, _callback, 0L); +timer_setup(_field1._timer, _callback, 0); | -setup_timer(_field1._timer, _callback, 0UL); +timer_setup(_field1._timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook --- net/ipv6/ip6_fib.c | 10 ++++++---- net/ncsi/ncsi-manage.c | 7 +++---- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2e2804f5823e..f5285f4e1d08 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -70,7 +70,7 @@ static int fib6_walk_continue(struct fib6_walker *w); * result of redirects, path MTU changes, etc. */ -static void fib6_gc_timer_cb(unsigned long arg); +static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) @@ -2026,9 +2026,11 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) spin_unlock_bh(&net->ipv6.fib6_gc_lock); } -static void fib6_gc_timer_cb(unsigned long arg) +static void fib6_gc_timer_cb(struct timer_list *t) { - fib6_run_gc(0, (struct net *)arg, true); + struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); + + fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) @@ -2043,7 +2045,7 @@ static int __net_init fib6_net_init(struct net *net) spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); - setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); + timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 3ccf8dfb233e..c989211bbabc 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -184,9 +184,9 @@ report: nd->handler(nd); } -static void ncsi_channel_monitor(unsigned long data) +static void ncsi_channel_monitor(struct timer_list *t) { - struct ncsi_channel *nc = (struct ncsi_channel *)data; + struct ncsi_channel *nc = from_timer(nc, t, monitor.timer); struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_channel_mode *ncm; @@ -313,8 +313,7 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; nc->monitor.enabled = false; - setup_timer(&nc->monitor.timer, - ncsi_channel_monitor, (unsigned long)nc); + timer_setup(&nc->monitor.timer, ncsi_channel_monitor, 0); spin_lock_init(&nc->lock); INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) -- cgit v1.2.3 From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Oct 2017 09:40:42 +0200 Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts With all callbacks converted, and the timer callback prototype switched over, the TIMER_FUNC_TYPE cast is no longer needed, so remove it. Conversion was done with the following scripts: perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \ $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u) perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \ $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u) The now unused macros are also dropped from include/linux/timer.h. Signed-off-by: Kees Cook --- net/atm/lec.c | 6 +++--- net/can/proc.c | 4 ++-- net/lapb/lapb_timer.c | 4 ++-- net/netrom/af_netrom.c | 2 +- net/netrom/nr_timer.c | 2 +- net/rose/rose_link.c | 4 ++-- net/rose/rose_timer.c | 12 ++++++------ net/sunrpc/svc_xprt.c | 2 +- net/x25/af_x25.c | 2 +- net/x25/x25_timer.c | 2 +- 10 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index c976196da3ea..6676e3433261 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1798,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp; + entry->timer.function = lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } @@ -1998,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; + entry->timer.function = lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); dump_arp_table(priv); @@ -2082,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; + entry->timer.function = lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); diff --git a/net/can/proc.c b/net/can/proc.c index d979b3dc49a6..0c59f876fe6f 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); - if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", can_stats->total_rx_match_ratio); @@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v) user_reset = 1; - if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", can_pstats->stats_reset + 1); } else { diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 8bb469cb3abe..5d4ae01951b5 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -42,7 +42,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) { del_timer(&lapb->t1timer); - lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry; + lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; add_timer(&lapb->t1timer); @@ -52,7 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb) { del_timer(&lapb->t2timer); - lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry; + lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; add_timer(&lapb->t2timer); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 2dec3583c97d..7ed9d4422a73 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer; + sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 43569aea0f5e..cbd51ed5a2d7 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -45,7 +45,7 @@ void nr_init_timers(struct sock *sk) timer_setup(&nr->idletimer, nr_idletimer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry; + sk->sk_timer.function = nr_heartbeat_expiry; } void nr_start_t1timer(struct sock *sk) diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index cda4c6678ef1..62055d3069d2 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -37,7 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh) { del_timer(&neigh->ftimer); - neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry; + neigh->ftimer.function = rose_ftimer_expiry; neigh->ftimer.expires = jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); @@ -48,7 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh) { del_timer(&neigh->t0timer); - neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry; + neigh->t0timer.function = rose_t0timer_expiry; neigh->t0timer.expires = jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index ea613b2a9735..74555fb95615 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -36,7 +36,7 @@ void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry; + sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; add_timer(&sk->sk_timer); @@ -48,7 +48,7 @@ void rose_start_t1timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; add_timer(&rose->timer); @@ -60,7 +60,7 @@ void rose_start_t2timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; add_timer(&rose->timer); @@ -72,7 +72,7 @@ void rose_start_t3timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; add_timer(&rose->timer); @@ -84,7 +84,7 @@ void rose_start_hbtimer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; add_timer(&rose->timer); @@ -97,7 +97,7 @@ void rose_start_idletimer(struct sock *sk) del_timer(&rose->idletimer); if (rose->idle > 0) { - rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry; + rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; add_timer(&rose->idletimer); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e8e0831229cf..f9307bd6644b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -745,7 +745,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp transports */ - serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts; + serv->sv_temptimer.function = svc_age_temp_xprts; mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ea87143314f3..562cc11131f6 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -415,7 +415,7 @@ static void __x25_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; - sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer; + sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 1dfba3c23459..fa3461002b3e 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -36,7 +36,7 @@ void x25_init_timers(struct sock *sk) timer_setup(&x25->timer, x25_timer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry; + sk->sk_timer.function = x25_heartbeat_expiry; } void x25_start_heartbeat(struct sock *sk) -- cgit v1.2.3 From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:53 +0000 Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, an helper argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO and the verifier can prove the value of this next argument is 0. However, most helpers are just interested in handling , so forcing them to deal with makes the implementation of those helpers more complicated for no apparent benefits, requiring them to explicitly handle those corner cases with checks that bpf programs could start relying upon, preventing the possibility of removing them later. Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case. Currently, the only helper that needs this is bpf_csum_diff_proto(), so change arg1 and arg3 to this new type as well. Also add a new battery of tests that explicitly test the !ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the various variations are focused on bpf_csum_diff, so cover also other helpers. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 1afa17935954..6a85e67fafce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -- cgit v1.2.3 From d7aa04a5e82b4f254d306926c81eae8df69e5200 Mon Sep 17 00:00:00 2001 From: Roman Kapl Date: Mon, 20 Nov 2017 22:21:13 +0100 Subject: net: sched: fix crash when deleting secondary chains If you flush (delete) a filter chain other than chain 0 (such as when deleting the device), the kernel may run into a use-after-free. The chain refcount must not be decremented unless we are sure we are done with the chain. To reproduce the bug, run: ip link add dtest type dummy tc qdisc add dev dtest ingress tc filter add dev dtest chain 1 parent ffff: flower ip link del dtest Introduced in: commit f93e1cdcf42c ("net/sched: fix filter flushing"), but unless you have KAsan or luck, you won't notice it until commit 0dadc117ac8b ("cls_flower: use tcf_exts_get_net() before call_rcu()") Fixes: f93e1cdcf42c ("net/sched: fix filter flushing") Acked-by: Jiri Pirko Signed-off-by: Roman Kapl Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ab255b421781..7d97f612c9b9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -205,13 +205,14 @@ static void tcf_chain_head_change(struct tcf_chain *chain, static void tcf_chain_flush(struct tcf_chain *chain) { - struct tcf_proto *tp; + struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); tcf_chain_head_change(chain, NULL); - while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { + while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_chain_put(chain); tcf_proto_destroy(tp); + tp = rtnl_dereference(chain->filter_chain); + tcf_chain_put(chain); } } -- cgit v1.2.3 From bbfcd77631573ac4a9f57eb6169e04256a111bc1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 21 Nov 2017 09:50:12 +0200 Subject: ipv6: Do not consider linkdown nexthops during multipath When the 'ignore_routes_with_linkdown' sysctl is set, we should not consider linkdown nexthops during route lookup. While the code correctly verifies that the initially selected route ('match') has a carrier, it does not perform the same check in the subsequent multipath selection, resulting in a potential packet loss. In case the chosen route does not have a carrier and the sysctl is set, choose the initially selected route. Fixes: 35103d11173b ("net: ipv6 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Ido Schimmel Acked-by: David Ahern Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 05eb7bc36156..0363db914c7a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -472,6 +472,11 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, &match->rt6i_siblings, rt6i_siblings) { route_choosen--; if (route_choosen == 0) { + struct inet6_dev *idev = sibling->rt6i_idev; + + if (!netif_carrier_ok(sibling->dst.dev) && + idev->cnf.ignore_routes_with_linkdown) + break; if (rt6_score_route(sibling, oif, strict) < 0) break; match = sibling; -- cgit v1.2.3 From 4e1061f4a2bba1669c7297455c73ddafbebf2b12 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 21 Nov 2017 13:23:53 +0100 Subject: net/smc: use sk_rcvbuf as start for rmb creation Commit 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") merged handling of SMC receive and send buffers. It introduced sk_buf_size as merged start value for size determination. But since sk_buf_size is not used at all, sk_sndbuf is erroneously used as start for rmb creation. This patch makes sure, sk_buf_size is really used as intended, and sk_rcvbuf is used as start value for rmb creation. Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") Signed-off-by: Ursula Braun Reviewed-by: Hans Wippel Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2578fbd95664..3b5e5d4bc763 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -575,7 +575,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + for (bufsize_short = smc_compress_bufsize(sk_buf_size); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { -- cgit v1.2.3 From 688703702584dd513b50001bd1eb068655631e9b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 21 Nov 2017 13:23:54 +0100 Subject: net/smc: Fix preinitialization of buf_desc in __smc_buf_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2: net/smc/smc_core.c: In function ‘__smc_buf_create’: net/smc/smc_core.c:567: warning: ‘bufsize’ may be used uninitialized in this function Indeed, if the for-loop is never executed, bufsize is used uninitialized. In addition, buf_desc is stored for later use, while it is still a NULL pointer. Before, error handling was done by checking if buf_desc is non-NULL. The cleanup changed this to an error check, but forgot to update the preinitialization of buf_desc to an error pointer. Update the preinitializatin of buf_desc to fix this. Fixes: b33982c3a6838d13 ("net/smc: cleanup function __smc_buf_create()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Ursula Braun Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3b5e5d4bc763..94f21116dac5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -562,7 +562,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - struct smc_buf_desc *buf_desc = NULL; + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; -- cgit v1.2.3 From 98d11291d189cb5adf49694d0ad1b971c0212697 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 21 Nov 2017 07:08:57 -0800 Subject: net: ipv6: Fixup device for anycast routes during copy Florian reported a breakage with anycast routes due to commit 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address"). Prior to this commit anycast routes were added against the loopback device causing repetitive route entries with no insight into why they existed. e.g.: $ ip -6 ro ls table local type anycast anycast 2001:db8:1:: dev lo proto kernel metric 0 pref medium anycast 2001:db8:2:: dev lo proto kernel metric 0 pref medium anycast fe80:: dev lo proto kernel metric 0 pref medium anycast fe80:: dev lo proto kernel metric 0 pref medium The point of commit 4832c30d5458 is to add the routes using the device with the address which is causing the route to be added. e.g.,: $ ip -6 ro ls table local type anycast anycast 2001:db8:1:: dev eth1 proto kernel metric 0 pref medium anycast 2001:db8:2:: dev eth2 proto kernel metric 0 pref medium anycast fe80:: dev eth2 proto kernel metric 0 pref medium anycast fe80:: dev eth1 proto kernel metric 0 pref medium For traffic to work as it did before, the dst device needs to be switched to the loopback when the copy is created similar to local routes. Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0363db914c7a..7a8d1500d374 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1024,7 +1024,7 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) { struct net_device *dev = rt->dst.dev; - if (rt->rt6i_flags & RTF_LOCAL) { + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default -- cgit v1.2.3 From 0c19f846d582af919db66a5914a0189f9f92c936 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 21 Nov 2017 10:22:25 -0500 Subject: net: accept UFO datagrams from tuntap and packet Tuntap and similar devices can inject GSO packets. Accept type VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively. Processes are expected to use feature negotiation such as TUNSETOFFLOAD to detect supported offload types and refrain from injecting other packets. This process breaks down with live migration: guest kernels do not renegotiate flags, so destination hosts need to expose all features that the source host does. Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677. This patch introduces nearly(*) no new code to simplify verification. It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP insertion and software UFO segmentation. It does not reinstate protocol stack support, hardware offload (NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception of VIRTIO_NET_HDR_GSO_UDP packets in tuntap. To support SKB_GSO_UDP reappearing in the stack, also reinstate logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD by squashing in commit 939912216fa8 ("net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1 ("net: avoid skb_warn_bad_offload false positives on UFO"). (*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id, ipv6_proxy_select_ident is changed to return a __be32 and this is assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted at the end of the enum to minimize code churn. Tested Booted a v4.13 guest kernel with QEMU. On a host kernel before this patch `ethtool -k eth0` shows UFO disabled. After the patch, it is enabled, same as on a v4.13 host kernel. A UFO packet sent from the guest appears on the tap device: host: nc -l -p -u 8000 & tcpdump -n -i tap0 guest: dd if=/dev/zero of=payload.txt bs=1 count=2000 nc -u 192.16.1.1 8000 < payload.txt Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds, packets arriving fragmented: ./with_tap_pair.sh ./tap_send_ufo tap0 tap1 (from https://github.com/wdebruij/kerneltools/tree/master/tests) Changes v1 -> v2 - simplified set_offload change (review comment) - documented test procedure Link: http://lkml.kernel.org/r/ Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.") Reported-by: Michal Kubecek Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- net/core/dev.c | 3 +- net/ipv4/af_inet.c | 12 +++++-- net/ipv4/udp_offload.c | 49 +++++++++++++++++++++++--- net/ipv6/output_core.c | 31 +++++++++++++++++ net/ipv6/udp_offload.c | 85 ++++++++++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 14 ++++++++ net/openvswitch/flow.c | 6 +++- net/sched/act_csum.c | 6 ++++ 8 files changed, 195 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..bbba19112f02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ce4aa827be05..f00499a46927 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool fixedid = false, gso_partial, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) { + if (udpfrag) { + iph->frag_off = htons(offset >> 3); + if (skb->next) + iph->frag_off |= htons(IP_MF); + offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e360d55be555..01801b77bd0d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,16 +187,57 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: return segs; } @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_tunnel_segment, + .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4a7e5ffa5108..4fe7c90962dd 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } +/* This function exists only for tap drivers that must support broken + * clients requesting UFO without specifying an IPv6 fragment ID. + * + * This is similar to ipv6_select_ident() but we use an independent hash + * seed to limit information leakage. + * + * The network header must be set before calling this. + */ +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) +{ + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 id; + + addrs = skb_header_pointer(skb, + skb_network_offset(skb) + + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) + return 0; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + return htonl(id); +} +EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 455fd4e39333..a0f89ad76f9d 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,15 +17,94 @@ #include #include "ip6_offload.h" -static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + __wsum csum; + int tnl_hlen; + int err; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); + else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } +out: return segs; } @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_tunnel_segment, + .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0dab33fb9844..99cfafc2a139 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + /* Queue all of the segments. */ skb = segs; do { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 864ddb1e3642..dbe2379329c5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF)) + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1c40caadcff9..d836f998117b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, -- cgit v1.2.3 From 03a6c82218b9a87014b2c6c4e178294fdc8ebd8a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:40 +0000 Subject: rxrpc: The mutex lock returned by rxrpc_accept_call() needs releasing The caller of rxrpc_accept_call() must release the lock on call->user_mutex returned by that function. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7d2595582c09..3a99b1a908df 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -619,8 +619,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call, rxrpc_call_put); - return 0; + ret = 0; + goto out_put_unlock; } call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID); @@ -689,6 +689,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len, NULL); } +out_put_unlock: mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put); -- cgit v1.2.3 From 48ca24636d5a26f1b7b8e69c54bccf19394843e5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:40 +0000 Subject: rxrpc: Don't set upgrade by default in sendmsg() Don't set upgrade by default when creating a call from sendmsg(). This is a holdover from when I was testing the code. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3a99b1a908df..94555c94b2d8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -602,7 +602,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, - .upgrade = true, + .upgrade = false, }; _enter(""); -- cgit v1.2.3 From 9faaff593404a9c4e5abc6839a641635d7b9d0cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:40 +0000 Subject: rxrpc: Provide a different lockdep key for call->user_mutex for kernel calls Provide a different lockdep key for rxrpc_call::user_mutex when the call is made on a kernel socket, such as by the AFS filesystem. The problem is that lockdep registers a false positive between userspace calling the sendmsg syscall on a user socket where call->user_mutex is held whilst userspace memory is accessed whereas the AFS filesystem may perform operations with mmap_sem held by the caller. In such a case, the following warning is produced. ====================================================== WARNING: possible circular locking dependency detected 4.14.0-fscache+ #243 Tainted: G E ------------------------------------------------------ modpost/16701 is trying to acquire lock: (&vnode->io_lock){+.+.}, at: [] afs_begin_vnode_operation+0x33/0x77 [kafs] but task is already holding lock: (&mm->mmap_sem){++++}, at: [] __do_page_fault+0x1ef/0x486 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++}: __might_fault+0x61/0x89 _copy_from_iter_full+0x40/0x1fa rxrpc_send_data+0x8dc/0xff3 rxrpc_do_sendmsg+0x62f/0x6a1 rxrpc_sendmsg+0x166/0x1b7 sock_sendmsg+0x2d/0x39 ___sys_sendmsg+0x1ad/0x22b __sys_sendmsg+0x41/0x62 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #2 (&call->user_mutex){+.+.}: __mutex_lock+0x86/0x7d2 rxrpc_new_client_call+0x378/0x80e rxrpc_kernel_begin_call+0xf3/0x154 afs_make_call+0x195/0x454 [kafs] afs_vl_get_capabilities+0x193/0x198 [kafs] afs_vl_lookup_vldb+0x5f/0x151 [kafs] afs_create_volume+0x2e/0x2f4 [kafs] afs_mount+0x56a/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #1 (k-sk_lock-AF_RXRPC){+.+.}: lock_sock_nested+0x74/0x8a rxrpc_kernel_begin_call+0x8a/0x154 afs_make_call+0x195/0x454 [kafs] afs_fs_get_capabilities+0x17a/0x17f [kafs] afs_probe_fileserver+0xf7/0x2f0 [kafs] afs_select_fileserver+0x83f/0x903 [kafs] afs_fetch_status+0x89/0x11d [kafs] afs_iget+0x16f/0x4f8 [kafs] afs_mount+0x6c6/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #0 (&vnode->io_lock){+.+.}: lock_acquire+0x174/0x19f __mutex_lock+0x86/0x7d2 afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 __clear_user+0x3d/0x60 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 other info that might help us debug this: Chain exists of: &vnode->io_lock --> &call->user_mutex --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&call->user_mutex); lock(&mm->mmap_sem); lock(&vnode->io_lock); *** DEADLOCK *** 1 lock held by modpost/16701: #0: (&mm->mmap_sem){++++}, at: [] __do_page_fault+0x1ef/0x486 stack backtrace: CPU: 0 PID: 16701 Comm: modpost Tainted: G E 4.14.0-fscache+ #243 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack+0x67/0x8e print_circular_bug+0x341/0x34f check_prev_add+0x11f/0x5d4 ? add_lock_to_list.isra.12+0x8b/0x8b ? add_lock_to_list.isra.12+0x8b/0x8b ? __lock_acquire+0xf77/0x10b4 __lock_acquire+0xf77/0x10b4 lock_acquire+0x174/0x19f ? afs_begin_vnode_operation+0x33/0x77 [kafs] __mutex_lock+0x86/0x7d2 ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba ? filemap_fault+0x179/0x54d filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 RIP: 0010:__clear_user+0x3d/0x60 RSP: 0018:ffff880071e93da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000000000011c RCX: 000000000000011c RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000060f720 RBP: 000000000060f720 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8800b5459b68 R12: ffff8800ce150e00 R13: 000000000060f720 R14: 00000000006127a8 R15: 0000000000000000 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fdb6009ee07 RSP: 002b:00007fff566d9728 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 000055ba57280900 RCX: 00007fdb6009ee07 RDX: 000055ba5727f270 RSI: 000055ba5727cac0 RDI: 000055ba57280900 RBP: 000055ba57280900 R08: 00007fff566d9700 R09: 0000000000000000 R10: 000055ba5727cac0 R11: 0000000000000246 R12: 0000000000000000 R13: 000055ba5727cac0 R14: 000055ba5727f270 R15: 0000000000000000 Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_object.c | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b2151993d384..a972887b3f5d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -672,7 +672,7 @@ extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); -struct rxrpc_call *rxrpc_alloc_call(gfp_t); +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cbd1701e813a..3028298ca561 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -94,7 +94,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, /* Now it gets complicated, because calls get registered with the * socket here, particularly if a user ID is preassigned by the user. */ - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4c7fbc6dcce7..1f141dc08ad2 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); } +static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; + /* * find an extant server call * - called in process context with IRQs enabled @@ -95,7 +97,7 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_call *call; @@ -114,6 +116,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) goto nomem_2; mutex_init(&call->user_mutex); + + /* Prevent lockdep reporting a deadlock false positive between the afs + * filesystem and sys_sendmsg() via the mmap sem. + */ + if (rx->sk.sk_kern_sock) + lockdep_set_class(&call->user_mutex, + &rxrpc_call_user_mutex_lock_class_key); + setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -151,7 +161,8 @@ nomem: /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; @@ -159,7 +170,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, _enter(""); - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -210,7 +221,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); -- cgit v1.2.3 From 3136ef49a14ccc148becf813074e08fc92fc9b23 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Delay terminal ACK transmission on a client call Delay terminal ACK transmission on a client call by deferring it to the connection processor. This allows it to be skipped if we can send the next call instead, the first DATA packet of which will implicitly ack this call. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 17 ++++++++++++ net/rxrpc/conn_client.c | 18 ++++++++++++ net/rxrpc/conn_event.c | 74 ++++++++++++++++++++++++++++++++++++++++--------- net/rxrpc/conn_object.c | 10 +++++++ net/rxrpc/recvmsg.c | 2 ++ 5 files changed, 108 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a972887b3f5d..d1213d503f30 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -338,8 +338,17 @@ enum rxrpc_conn_flag { RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ + RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ + RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ + RXRPC_CONN_FINAL_ACK_2, /* Need final ACK for channel 2 */ + RXRPC_CONN_FINAL_ACK_3, /* Need final ACK for channel 3 */ }; +#define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) | \ + (1UL << RXRPC_CONN_FINAL_ACK_1) | \ + (1UL << RXRPC_CONN_FINAL_ACK_2) | \ + (1UL << RXRPC_CONN_FINAL_ACK_3)) + /* * Events that can be raised upon a connection. */ @@ -393,6 +402,7 @@ struct rxrpc_connection { #define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1) struct list_head waiting_calls; /* Calls waiting for channels */ struct rxrpc_channel { + unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call __rcu *call; /* Active call */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ @@ -404,6 +414,7 @@ struct rxrpc_connection { }; } channels[RXRPC_MAXCALLS]; + struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ @@ -861,6 +872,12 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn) rxrpc_put_service_conn(conn); } +static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, + unsigned long expire_at) +{ + timer_reduce(&conn->timer, expire_at); +} + /* * conn_service.c */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5f9624bd311c..cfb997593da9 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -554,6 +554,11 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + /* Cancel the final ACK on the previous call if it hasn't been sent yet + * as the DATA packet will implicitly ACK it. + */ + clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); + write_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags)) call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; @@ -813,6 +818,19 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out_2; } + /* Schedule the final ACK to be transmitted in a short while so that it + * can be skipped if we find a follow-on call. The first DATA packet + * of the follow on call will implicitly ACK this call. + */ + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + unsigned long final_ack_at = jiffies + 2; + + WRITE_ONCE(chan->final_ack_at, final_ack_at); + smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ + set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); + rxrpc_reduce_conn_timer(conn, final_ack_at); + } + /* Things are more complex and we need the cache lock. We might be * able to simply idle the conn or it might now be lurking on the wait * list. It might even get moved back to the active list whilst we're diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 59a51a56e7c8..9e9a8db1bc9c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -24,9 +24,10 @@ * Retransmit terminal ACK or ABORT of the previous call. */ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + unsigned int channel) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; struct msghdr msg; struct kvec iov; @@ -48,7 +49,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _enter("%d", conn->debug_id); - chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK]; + chan = &conn->channels[channel]; /* If the last call got moved on whilst we were waiting to run, just * ignore this packet. @@ -56,7 +57,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, call_id = READ_ONCE(chan->last_call); /* Sync with __rxrpc_disconnect_call() */ smp_rmb(); - if (call_id != sp->hdr.callNumber) + if (skb && call_id != sp->hdr.callNumber) return; msg.msg_name = &conn->params.peer->srx.transport; @@ -65,9 +66,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - pkt.whdr.epoch = htonl(sp->hdr.epoch); - pkt.whdr.cid = htonl(sp->hdr.cid); - pkt.whdr.callNumber = htonl(sp->hdr.callNumber); + pkt.whdr.epoch = htonl(conn->proto.epoch); + pkt.whdr.cid = htonl(conn->proto.cid); + pkt.whdr.callNumber = htonl(call_id); pkt.whdr.seq = 0; pkt.whdr.type = chan->last_type; pkt.whdr.flags = conn->out_clientflag; @@ -87,11 +88,11 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; pkt.ack.bufferSpace = 0; - pkt.ack.maxSkew = htons(skb->priority); - pkt.ack.firstPacket = htonl(chan->last_seq); - pkt.ack.previousPacket = htonl(chan->last_seq - 1); - pkt.ack.serial = htonl(sp->hdr.serial); - pkt.ack.reason = RXRPC_ACK_DUPLICATE; + pkt.ack.maxSkew = htons(skb ? skb->priority : 0); + pkt.ack.firstPacket = htonl(chan->last_seq + 1); + pkt.ack.previousPacket = htonl(chan->last_seq); + pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); + pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; pkt.info.rxMTU = htonl(rxrpc_rx_mtu); pkt.info.maxMTU = htonl(mtu); @@ -272,7 +273,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb); + rxrpc_conn_retransmit_call(conn, skb, + sp->hdr.cid & RXRPC_CHANNELMASK); return 0; case RXRPC_PACKET_TYPE_BUSY: @@ -378,6 +380,48 @@ abort: _leave(" [aborted]"); } +/* + * Process delayed final ACKs that we haven't subsumed into a subsequent call. + */ +static void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn) +{ + unsigned long j = jiffies, next_j; + unsigned int channel; + bool set; + +again: + next_j = j + LONG_MAX; + set = false; + for (channel = 0; channel < RXRPC_MAXCALLS; channel++) { + struct rxrpc_channel *chan = &conn->channels[channel]; + unsigned long ack_at; + + if (!test_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags)) + continue; + + smp_rmb(); /* vs rxrpc_disconnect_client_call */ + ack_at = READ_ONCE(chan->final_ack_at); + + if (time_before(j, ack_at)) { + if (time_before(ack_at, next_j)) { + next_j = ack_at; + set = true; + } + continue; + } + + if (test_and_clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, + &conn->flags)) + rxrpc_conn_retransmit_call(conn, NULL, channel); + } + + j = jiffies; + if (time_before_eq(next_j, j)) + goto again; + if (set) + rxrpc_reduce_conn_timer(conn, next_j); +} + /* * connection-level event processor */ @@ -394,6 +438,10 @@ void rxrpc_process_connection(struct work_struct *work) if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); + /* Process delayed ACKs whose time has come. */ + if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) + rxrpc_process_delayed_final_acks(conn); + /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index fe575798592f..a01a3791f06a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -24,6 +24,14 @@ unsigned int rxrpc_connection_expiry = 10 * 60; static void rxrpc_destroy_connection(struct rcu_head *); +static void rxrpc_connection_timer(struct timer_list *timer) +{ + struct rxrpc_connection *conn = + container_of(timer, struct rxrpc_connection, timer); + + rxrpc_queue_conn(conn); +} + /* * allocate a new connection */ @@ -38,6 +46,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) INIT_LIST_HEAD(&conn->cache_link); spin_lock_init(&conn->channel_lock); INIT_LIST_HEAD(&conn->waiting_calls); + timer_setup(&conn->timer, &rxrpc_connection_timer, 0); INIT_WORK(&conn->processor, &rxrpc_process_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); @@ -332,6 +341,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) _net("DESTROY CONN %d", conn->debug_id); + del_timer_sync(&conn->timer); rxrpc_purge_queue(&conn->rx_queue); conn->security->clear(conn); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8510a98b87e1..be0b9ae13893 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -144,11 +144,13 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); +#if 0 // TODO: May want to transmit final ACK under some circumstances anyway if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); rxrpc_send_ack_packet(call, false); } +#endif write_lock_bh(&call->state_lock); -- cgit v1.2.3 From 4812417894770f8c13e5dd8a66479ae44f4b01ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Split the call params from the operation params When rxrpc_sendmsg() parses the control message buffer, it places the parameters extracted into a structure, but lumps together call parameters (such as user call ID) with operation parameters (such as whether to send data, send an abort or accept a call). Split the call parameters out into their own structure, a copy of which is then embedded in the operation parameters struct. The call parameters struct is then passed down into the places that need it instead of passing the individual parameters. This allows for extra call parameters to be added. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 8 ++++++-- net/rxrpc/ar-internal.h | 31 +++++++++++++++++++++++++++++- net/rxrpc/call_object.c | 15 +++++++-------- net/rxrpc/sendmsg.c | 51 +++++++++++++++++-------------------------------- 4 files changed, 60 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9b5c46b052fd..c0cdcf980ffc 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -285,6 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, bool upgrade) { struct rxrpc_conn_parameters cp; + struct rxrpc_call_params p; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -302,6 +303,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ + memset(&p, 0, sizeof(p)); + p.user_call_ID = user_call_ID; + p.tx_total_len = tx_total_len; + memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.key = key; @@ -309,8 +314,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len, - gfp); + call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp); /* The socket has been unlocked. */ if (!IS_ERR(call)) { call->notify_rx = notify_rx; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d1213d503f30..ba63f2231107 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -643,6 +643,35 @@ struct rxrpc_ack_summary { u8 cumulative_acks; }; +/* + * sendmsg() cmsg-specified parameters. + */ +enum rxrpc_command { + RXRPC_CMD_SEND_DATA, /* send data message */ + RXRPC_CMD_SEND_ABORT, /* request abort generation */ + RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ + RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ +}; + +struct rxrpc_call_params { + s64 tx_total_len; /* Total Tx data length (if send data) */ + unsigned long user_call_ID; /* User's call ID */ + struct { + u32 hard; /* Maximum lifetime (sec) */ + u32 idle; /* Max time since last data packet (msec) */ + u32 normal; /* Max time since last call packet (msec) */ + } timeouts; + u8 nr_timeouts; /* Number of timeouts specified */ +}; + +struct rxrpc_send_params { + struct rxrpc_call_params call; + u32 abort_code; /* Abort code to Tx (if abort) */ + enum rxrpc_command command : 8; /* The command to implement */ + bool exclusive; /* Shared or exclusive call */ + bool upgrade; /* If the connection is upgradeable */ +}; + #include /* @@ -687,7 +716,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - unsigned long, s64, gfp_t); + struct rxrpc_call_params *, gfp_t); int rxrpc_retry_client_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1f141dc08ad2..c3e1fa854471 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -208,8 +208,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, - unsigned long user_call_ID, - s64 tx_total_len, + struct rxrpc_call_params *p, gfp_t gfp) __releases(&rx->sk.sk_lock.slock) { @@ -219,7 +218,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, const void *here = __builtin_return_address(0); int ret; - _enter("%p,%lx", rx, user_call_ID); + _enter("%p,%lx", rx, p->user_call_ID); call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { @@ -228,9 +227,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - call->tx_total_len = tx_total_len; + call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), - here, (const void *)user_call_ID); + here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. @@ -246,16 +245,16 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); - if (user_call_ID < xcall->user_call_ID) + if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; - else if (user_call_ID > xcall->user_call_ID) + else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); - call->user_call_ID = user_call_ID; + call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 94555c94b2d8..de5ab327c18a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -21,22 +21,6 @@ #include #include "ar-internal.h" -enum rxrpc_command { - RXRPC_CMD_SEND_DATA, /* send data message */ - RXRPC_CMD_SEND_ABORT, /* request abort generation */ - RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ - RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ -}; - -struct rxrpc_send_params { - s64 tx_total_len; /* Total Tx data length (if send data) */ - unsigned long user_call_ID; /* User's call ID */ - u32 abort_code; /* Abort code to Tx (if abort) */ - enum rxrpc_command command : 8; /* The command to implement */ - bool exclusive; /* Shared or exclusive call */ - bool upgrade; /* If the connection is upgradeable */ -}; - /* * Wait for space to appear in the Tx queue or a signal to occur. */ @@ -480,11 +464,11 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; - p->user_call_ID = *(u32 *)CMSG_DATA(cmsg); + p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; - p->user_call_ID = *(unsigned long *) + p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; @@ -522,10 +506,10 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) break; case RXRPC_TX_LENGTH: - if (p->tx_total_len != -1 || len != sizeof(__s64)) + if (p->call.tx_total_len != -1 || len != sizeof(__s64)) return -EINVAL; - p->tx_total_len = *(__s64 *)CMSG_DATA(cmsg); - if (p->tx_total_len < 0) + p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); + if (p->call.tx_total_len < 0) return -EINVAL; break; @@ -536,7 +520,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) if (!got_user_ID) return -EINVAL; - if (p->tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) + if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; @@ -576,8 +560,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, p->user_call_ID, - p->tx_total_len, GFP_KERNEL); + call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL); /* The socket is now unlocked */ _leave(" = %p\n", call); @@ -597,12 +580,12 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) int ret; struct rxrpc_send_params p = { - .tx_total_len = -1, - .user_call_ID = 0, - .abort_code = 0, - .command = RXRPC_CMD_SEND_DATA, - .exclusive = false, - .upgrade = false, + .call.tx_total_len = -1, + .call.user_call_ID = 0, + .abort_code = 0, + .command = RXRPC_CMD_SEND_DATA, + .exclusive = false, + .upgrade = false, }; _enter(""); @@ -615,7 +598,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; - call = rxrpc_accept_call(rx, p.user_call_ID, NULL); + call = rxrpc_accept_call(rx, p.call.user_call_ID, NULL); /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); @@ -623,7 +606,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) goto out_put_unlock; } - call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID); + call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) @@ -653,13 +636,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) goto error_put; } - if (p.tx_total_len != -1) { + if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto error_put; - call->tx_total_len = p.tx_total_len; + call->tx_total_len = p.call.tx_total_len; } } -- cgit v1.2.3 From a158bdd3247b9656df36ba133235fff702e9fdc3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 37 ++++++---- net/rxrpc/call_event.c | 179 +++++++++++++++++++++++------------------------- net/rxrpc/call_object.c | 27 +++++--- net/rxrpc/conn_client.c | 4 +- net/rxrpc/input.c | 34 +++++++-- net/rxrpc/misc.c | 19 ++--- net/rxrpc/recvmsg.c | 2 +- net/rxrpc/sendmsg.c | 59 +++++++++++++--- net/rxrpc/sysctl.c | 60 ++++++++-------- 9 files changed, 242 insertions(+), 179 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ba63f2231107..548411371048 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -468,9 +468,9 @@ enum rxrpc_call_flag { enum rxrpc_call_event { RXRPC_CALL_EV_ACK, /* need to generate ACK */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ RXRPC_CALL_EV_PING, /* Ping send required */ + RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ }; /* @@ -514,10 +514,14 @@ struct rxrpc_call { struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct mutex user_mutex; /* User access mutex */ - ktime_t ack_at; /* When deferred ACK needs to happen */ - ktime_t resend_at; /* When next resend needs to happen */ - ktime_t ping_at; /* When next to send a ping */ - ktime_t expire_at; /* When the call times out */ + unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long resend_at; /* When next resend needs to happen */ + unsigned long ping_at; /* When next to send a ping */ + unsigned long expect_rx_by; /* When we expect to get a packet by */ + unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ + unsigned long expect_term_by; /* When we expect call termination by */ + u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ + u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -697,12 +701,19 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); -void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); +static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why) +{ + trace_rxrpc_timer(call, why, now); + timer_reduce(&call->timer, expire_at); +} + /* * call_object.c */ @@ -843,8 +854,8 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, */ extern unsigned int rxrpc_max_client_connections; extern unsigned int rxrpc_reap_client_connections; -extern unsigned int rxrpc_conn_idle_client_expiry; -extern unsigned int rxrpc_conn_idle_client_fast_expiry; +extern unsigned long rxrpc_conn_idle_client_expiry; +extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); @@ -976,13 +987,13 @@ static inline void rxrpc_queue_local(struct rxrpc_local *local) * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; -extern unsigned int rxrpc_requested_ack_delay; -extern unsigned int rxrpc_soft_ack_delay; -extern unsigned int rxrpc_idle_ack_delay; +extern unsigned long rxrpc_requested_ack_delay; +extern unsigned long rxrpc_soft_ack_delay; +extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; -extern unsigned int rxrpc_resend_timeout; +extern unsigned long rxrpc_resend_timeout; extern const s8 rxrpc_ack_priority[]; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3574508baf9a..c14395d5ad8c 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -21,80 +21,6 @@ #include #include "ar-internal.h" -/* - * Set the timer - */ -void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, - ktime_t now) -{ - unsigned long t_j, now_j = jiffies; - ktime_t t; - bool queue = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - t = call->expire_at; - if (!ktime_after(t, now)) { - trace_rxrpc_timer(call, why, now, now_j); - queue = true; - goto out; - } - - if (!ktime_after(call->resend_at, now)) { - call->resend_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - queue = true; - } else if (ktime_before(call->resend_at, t)) { - t = call->resend_at; - } - - if (!ktime_after(call->ack_at, now)) { - call->ack_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - queue = true; - } else if (ktime_before(call->ack_at, t)) { - t = call->ack_at; - } - - if (!ktime_after(call->ping_at, now)) { - call->ping_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) - queue = true; - } else if (ktime_before(call->ping_at, t)) { - t = call->ping_at; - } - - t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); - t_j += jiffies; - - /* We have to make sure that the calculated jiffies value falls - * at or after the nsec value, or we may loop ceaselessly - * because the timer times out, but we haven't reached the nsec - * timeout yet. - */ - t_j++; - - if (call->timer.expires != t_j || !timer_pending(&call->timer)) { - mod_timer(&call->timer, t_j); - trace_rxrpc_timer(call, why, now, now_j); - } - } - -out: - if (queue) - rxrpc_queue_call(call); -} - -/* - * Set the timer - */ -void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, - ktime_t now) -{ - read_lock_bh(&call->state_lock); - __rxrpc_set_timer(call, why, now); - read_unlock_bh(&call->state_lock); -} - /* * Propose a PING ACK be sent. */ @@ -106,12 +32,13 @@ static void rxrpc_propose_ping(struct rxrpc_call *call, !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) rxrpc_queue_call(call); } else { - ktime_t now = ktime_get_real(); - ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay); + unsigned long now = jiffies; + unsigned long ping_at = now + rxrpc_idle_ack_delay; - if (ktime_before(ping_at, call->ping_at)) { - call->ping_at = ping_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now); + if (time_before(ping_at, call->ping_at)) { + WRITE_ONCE(call->ping_at, ping_at); + rxrpc_reduce_call_timer(call, ping_at, now, + rxrpc_timer_set_for_ping); } } } @@ -125,8 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; - unsigned int expiry = rxrpc_soft_ack_delay; - ktime_t now, ack_at; + unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; /* Pings are handled specially because we don't want to accidentally @@ -190,11 +116,12 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, background) rxrpc_queue_call(call); } else { - now = ktime_get_real(); - ack_at = ktime_add_ms(now, expiry); - if (ktime_before(ack_at, call->ack_at)) { - call->ack_at = ack_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now); + now = jiffies; + ack_at = jiffies + expiry; + if (time_before(ack_at, call->ack_at)) { + WRITE_ONCE(call->ack_at, ack_at); + rxrpc_reduce_call_timer(call, ack_at, now, + rxrpc_timer_set_for_ack); } } @@ -227,18 +154,20 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Perform retransmission of NAK'd and unack'd packets. */ -static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) +static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + unsigned long resend_at; rxrpc_seq_t cursor, seq, top; - ktime_t max_age, oldest, ack_ts; + ktime_t now, max_age, oldest, ack_ts; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); - max_age = ktime_sub_ms(now, rxrpc_resend_timeout); + now = ktime_get_real(); + max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ); spin_lock_bh(&call->lock); @@ -282,7 +211,9 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now))); + resend_at += jiffies + rxrpc_resend_timeout; + WRITE_ONCE(call->resend_at, resend_at); if (unacked) rxrpc_congestion_timeout(call); @@ -292,7 +223,8 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) * retransmitting data. */ if (!retrans) { - rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); + rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_ns(ack_ts) < call->peer->rtt) @@ -364,7 +296,7 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - ktime_t now; + unsigned long now, next, t; rxrpc_see_call(call); @@ -384,8 +316,50 @@ recheck_state: goto out_put; } - now = ktime_get_real(); - if (ktime_before(call->expire_at, now)) { + /* Work out if any timeouts tripped */ + now = jiffies; + t = READ_ONCE(call->expect_rx_by); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->expect_req_by); + if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST && + time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->expect_term_by); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->ack_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); + cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_ACK, &call->events); + } + + t = READ_ONCE(call->ping_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); + cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_PING, &call->events); + } + + t = READ_ONCE(call->resend_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now); + cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); + } + + /* Process events */ + if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; @@ -408,7 +382,22 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); + /* Make sure the timer is restarted */ + next = call->expect_rx_by; + +#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } + + set(call->expect_req_by); + set(call->expect_term_by); + set(call->ack_at); + set(call->resend_at); + set(call->ping_at); + + now = jiffies; + if (time_after_eq(now, next)) + goto recheck_state; + + rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c3e1fa854471..b305970a9b63 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -51,8 +51,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + rxrpc_queue_call(call); + } } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; @@ -139,6 +141,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); call->tx_total_len = -1; + call->next_rx_timo = 20 * HZ; + call->next_req_timo = 1 * HZ; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -189,15 +193,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, */ static void rxrpc_start_call_timer(struct rxrpc_call *call) { - ktime_t now = ktime_get_real(), expire_at; - - expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); - call->expire_at = expire_at; - call->ack_at = expire_at; - call->ping_at = expire_at; - call->resend_at = expire_at; - call->timer.expires = jiffies + LONG_MAX / 2; - rxrpc_set_timer(call, rxrpc_timer_begin, now); + unsigned long now = jiffies; + unsigned long j = now + MAX_JIFFY_OFFSET; + + call->ack_at = j; + call->resend_at = j; + call->ping_at = j; + call->expect_rx_by = j; + call->expect_req_by = j; + call->expect_term_by = j; + call->timer.expires = now; } /* diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index cfb997593da9..97f6a8de4845 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -85,8 +85,8 @@ __read_mostly unsigned int rxrpc_max_client_connections = 1000; __read_mostly unsigned int rxrpc_reap_client_connections = 900; -__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; -__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ; +__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; +__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; /* * We use machine-unique IDs for our client connections. diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1b592073ec96..c89647eae86d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -318,16 +318,18 @@ bad_state: static bool rxrpc_receiving_reply(struct rxrpc_call *call) { struct rxrpc_ack_summary summary = { 0 }; + unsigned long now, timo; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { spin_lock_bh(&call->lock); call->ackr_reason = 0; - call->resend_at = call->expire_at; - call->ack_at = call->expire_at; spin_unlock_bh(&call->lock); - rxrpc_set_timer(call, rxrpc_timer_init_for_reply, - ktime_get_real()); + now = jiffies; + timo = now + MAX_JIFFY_OFFSET; + WRITE_ONCE(call->resend_at, timo); + WRITE_ONCE(call->ack_at, timo); + trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) @@ -437,6 +439,19 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, if (state >= RXRPC_CALL_COMPLETE) return; + if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { + unsigned long timo = READ_ONCE(call->next_req_timo); + unsigned long now, expect_req_by; + + if (timo) { + now = jiffies; + expect_req_by = now + timo; + WRITE_ONCE(call->expect_req_by, expect_req_by); + rxrpc_reduce_call_timer(call, expect_req_by, now, + rxrpc_timer_set_for_idle); + } + } + /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ @@ -908,9 +923,20 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned long timo; _enter("%p,%p", call, skb); + timo = READ_ONCE(call->next_rx_timo); + if (timo) { + unsigned long now = jiffies, expect_rx_by; + + expect_rx_by = jiffies + timo; + WRITE_ONCE(call->expect_rx_by, expect_rx_by); + rxrpc_reduce_call_timer(call, expect_rx_by, now, + rxrpc_timer_set_for_normal); + } + switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb, skew); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1a2d4b112064..c1d9e7fd7448 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -20,34 +20,29 @@ */ unsigned int rxrpc_max_backlog __read_mostly = 10; -/* - * Maximum lifetime of a call (in mx). - */ -unsigned int rxrpc_max_call_lifetime = 60 * 1000; - /* * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in ms). + * packet with RXRPC_REQUEST_ACK set (in jiffies). */ -unsigned int rxrpc_requested_ack_delay = 1; +unsigned long rxrpc_requested_ack_delay = 1; /* - * How long to wait before scheduling an ACK with subtype DELAY (in ms). + * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned int rxrpc_soft_ack_delay = 1 * 1000; +unsigned long rxrpc_soft_ack_delay = HZ; /* - * How long to wait before scheduling an ACK with subtype IDLE (in ms). + * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned int rxrpc_idle_ack_delay = 0.5 * 1000; +unsigned long rxrpc_idle_ack_delay = HZ / 2; /* * Receive window size in packets. This indicates the maximum number of @@ -75,7 +70,7 @@ unsigned int rxrpc_rx_jumbo_max = 4; /* * Time till packet resend (in milliseconds). */ -unsigned int rxrpc_resend_timeout = 4 * 1000; +unsigned long rxrpc_resend_timeout = 4 * HZ; const s8 rxrpc_ack_priority[] = { [0] = 0, diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index be0b9ae13893..0b6609da80b7 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -163,7 +163,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) case RXRPC_CALL_SERVER_RECV_REQUEST: call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - call->ack_at = call->expire_at; + call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true, rxrpc_propose_ack_processing_op); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index de5ab327c18a..03e0676db28c 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -158,6 +158,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned long now; rxrpc_seq_t seq = sp->hdr.seq; int ret, ix; u8 annotation = RXRPC_TX_ANNO_UNACK; @@ -197,11 +198,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, break; case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; - call->ack_at = call->expire_at; + now = jiffies; + WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET); if (call->ackr_reason == RXRPC_ACK_DELAY) call->ackr_reason = 0; - __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply, - ktime_get_real()); + trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); if (!last) break; /* Fall through */ @@ -223,14 +224,12 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); } else { - ktime_t now = ktime_get_real(), resend_at; + unsigned long now = jiffies, resend_at; - resend_at = ktime_add_ms(now, rxrpc_resend_timeout); - - if (ktime_before(resend_at, call->resend_at)) { - call->resend_at = resend_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_send, now); - } + resend_at = now + rxrpc_resend_timeout; + WRITE_ONCE(call->resend_at, resend_at); + rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_timer_set_for_send); } rxrpc_free_skb(skb, rxrpc_skb_tx_freed); @@ -513,6 +512,19 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) return -EINVAL; break; + case RXRPC_SET_CALL_TIMEOUT: + if (len & 3 || len < 4 || len > 12) + return -EINVAL; + memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); + p->call.nr_timeouts = len / 4; + if (p->call.timeouts.hard > INT_MAX / HZ) + return -ERANGE; + if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) + return -ERANGE; + if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) + return -ERANGE; + break; + default: return -EINVAL; } @@ -577,11 +589,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { enum rxrpc_call_state state; struct rxrpc_call *call; + unsigned long now, j; int ret; struct rxrpc_send_params p = { .call.tx_total_len = -1, .call.user_call_ID = 0, + .call.nr_timeouts = 0, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, @@ -646,6 +660,31 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } } + switch (p.call.nr_timeouts) { + case 3: + j = msecs_to_jiffies(p.call.timeouts.normal); + if (p.call.timeouts.normal > 0 && j == 0) + j = 1; + WRITE_ONCE(call->next_rx_timo, j); + /* Fall through */ + case 2: + j = msecs_to_jiffies(p.call.timeouts.idle); + if (p.call.timeouts.idle > 0 && j == 0) + j = 1; + WRITE_ONCE(call->next_req_timo, j); + /* Fall through */ + case 1: + if (p.call.timeouts.hard > 0) { + j = msecs_to_jiffies(p.call.timeouts.hard); + now = jiffies; + j += now; + WRITE_ONCE(call->expect_term_by, j); + rxrpc_reduce_call_timer(call, j, now, + rxrpc_timer_set_for_hard); + } + break; + } + state = READ_ONCE(call->state); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, state, call->conn); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 34c706d2f79c..4a7af7aff37d 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,6 +21,8 @@ static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; +static const unsigned long one_jiffy = 1; +static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. @@ -29,64 +31,60 @@ static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { - /* Values measured in milliseconds */ + /* Values measured in milliseconds but used in jiffies */ { .procname = "req_ack_delay", .data = &rxrpc_requested_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&zero, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, - }, - { - .procname = "resend_timeout", - .data = &rxrpc_resend_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_conn_expiry", .data = &rxrpc_conn_idle_client_expiry, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_conn_fast_expiry", .data = &rxrpc_conn_idle_client_fast_expiry, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, - - /* Values measured in seconds but used in jiffies */ { - .procname = "max_call_lifetime", - .data = &rxrpc_max_call_lifetime, - .maxlen = sizeof(unsigned int), + .procname = "resend_timeout", + .data = &rxrpc_resend_timeout, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, /* Non-time values */ -- cgit v1.2.3 From 8637abaa72fe8200a4a37579e6ec5273db85d2c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Don't transmit DELAY ACKs immediately on proposal Don't transmit a DELAY ACK immediately on proposal when the Rx window is rotated, but rather defer it to the work function. This means that we have a chance to queue/consume more received packets before we actually send the DELAY ACK, or even cancel it entirely, thereby reducing the number of packets transmitted. We do, however, want to continue sending other types of packet immediately, particularly REQUESTED ACKs, as they may be used for RTT calculation by the other side. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0b6609da80b7..fad5f42a3abd 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -219,9 +219,9 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) after_eq(top, call->ackr_seen + 2) || (hard_ack == top && after(hard_ack, call->ackr_consumed))) rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, - true, false, + true, true, rxrpc_propose_ack_rotate_rx); - if (call->ackr_reason) + if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY) rxrpc_send_ack_packet(call, false); } } -- cgit v1.2.3 From beb8e5e4f38cc3e4c2839cfc143e0312bf53d0e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Express protocol timeouts in terms of RTT Express protocol timeouts for data retransmission and deferred ack generation in terms on RTT rather than specified timeouts once we have sufficient RTT samples. For the moment, this requires just one RTT sample to be able to use this for ack deferral and two for data retransmission. The data retransmission timeout is set at RTT*1.5 and the ACK deferral timeout is set at RTT. Note that the calculated timeout is limited to a minimum of 4ns to make sure it doesn't happen too quickly. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 22 ++++++++++++++++++---- net/rxrpc/sendmsg.c | 7 +++++++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c14395d5ad8c..da91f16ac77c 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -52,7 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; - unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; + unsigned long expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; /* Pings are handled specially because we don't want to accidentally @@ -116,7 +116,13 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, background) rxrpc_queue_call(call); } else { - now = jiffies; + unsigned long now = jiffies, ack_at; + + if (call->peer->rtt_usage > 0) + ack_at = nsecs_to_jiffies(call->peer->rtt); + else + ack_at = expiry; + ack_at = jiffies + expiry; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); @@ -160,14 +166,22 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) struct sk_buff *skb; unsigned long resend_at; rxrpc_seq_t cursor, seq, top; - ktime_t now, max_age, oldest, ack_ts; + ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); + if (call->peer->rtt_usage > 1) + timeout = ns_to_ktime(call->peer->rtt * 3 / 2); + else + timeout = ms_to_ktime(rxrpc_resend_timeout); + min_timeo = ns_to_ktime((1000000000 / HZ) * 4); + if (ktime_before(timeout, min_timeo)) + timeout = min_timeo; + now = ktime_get_real(); - max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ); + max_age = ktime_sub(now, timeout); spin_lock_bh(&call->lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 03e0676db28c..c56ee54fdd1f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -226,6 +226,13 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, } else { unsigned long now = jiffies, resend_at; + if (call->peer->rtt_usage > 1) + resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2); + else + resend_at = rxrpc_resend_timeout; + if (resend_at < 1) + resend_at = 1; + resend_at = now + rxrpc_resend_timeout; WRITE_ONCE(call->resend_at, resend_at); rxrpc_reduce_call_timer(call, resend_at, now, -- cgit v1.2.3 From bd1fdf8cfdf3fdbccd2b21c33ec649ebd7429af7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:42 +0000 Subject: rxrpc: Add a timeout for detecting lost ACKs/lost DATA Add an extra timeout that is set/updated when we send a DATA packet that has the request-ack flag set. This allows us to detect if we don't get an ACK in response to the latest flagged packet. The ACK packet is adjudged to have been lost if it doesn't turn up within 2*RTT of the transmission. If the timeout occurs, we schedule the sending of a PING ACK to find out the state of the other side. If a new DATA packet is ready to go sooner, we cancel the sending of the ping and set the request-ack flag on that instead. If we get back a PING-RESPONSE ACK that indicates a lower tx_top than what we had at the time of the ping transmission, we adjudge all the DATA packets sent between the response tx_top and the ping-time tx_top to have been lost and retransmit immediately. Rather than sending a PING ACK, we could just pick a DATA packet and speculatively retransmit that with request-ack set. It should result in either a REQUESTED ACK or a DUPLICATE ACK which we can then use in lieu the a PING-RESPONSE ACK mentioned above. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 6 +++++- net/rxrpc/call_event.c | 26 ++++++++++++++++++++++---- net/rxrpc/call_object.c | 1 + net/rxrpc/input.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/rxrpc/output.c | 20 ++++++++++++++++++-- net/rxrpc/recvmsg.c | 4 ++-- net/rxrpc/sendmsg.c | 2 +- 7 files changed, 89 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 548411371048..7e7b817c69f0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -471,6 +471,7 @@ enum rxrpc_call_event { RXRPC_CALL_EV_RESEND, /* Tx resend required */ RXRPC_CALL_EV_PING, /* Ping send required */ RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ + RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ }; /* @@ -515,6 +516,7 @@ struct rxrpc_call { struct rxrpc_sock __rcu *socket; /* socket responsible */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long ack_lost_at; /* When ACK is figured as lost */ unsigned long resend_at; /* When next resend needs to happen */ unsigned long ping_at; /* When next to send a ping */ unsigned long expect_rx_by; /* When we expect to get a packet by */ @@ -624,6 +626,8 @@ struct rxrpc_call { ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ + rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ + rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ }; /* @@ -1011,7 +1015,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -int rxrpc_send_ack_packet(struct rxrpc_call *, bool); +int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index da91f16ac77c..c65666b2f39e 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -245,7 +245,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) goto out; rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); - rxrpc_send_ack_packet(call, true); + rxrpc_send_ack_packet(call, true, NULL); goto out; } @@ -310,6 +310,7 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); + rxrpc_serial_t *send_ack; unsigned long now, next, t; rxrpc_see_call(call); @@ -358,6 +359,13 @@ recheck_state: set_bit(RXRPC_CALL_EV_ACK, &call->events); } + t = READ_ONCE(call->ack_lost_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now); + cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); + } + t = READ_ONCE(call->ping_at); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); @@ -379,15 +387,24 @@ recheck_state: goto recheck_state; } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) { + send_ack = NULL; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) { + call->acks_lost_top = call->tx_top; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + send_ack = &call->acks_lost_ping; + } + + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || + send_ack) { if (call->ackr_reason) { - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, send_ack); goto recheck_state; } } if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) { - rxrpc_send_ack_packet(call, true); + rxrpc_send_ack_packet(call, true, NULL); goto recheck_state; } @@ -404,6 +421,7 @@ recheck_state: set(call->expect_req_by); set(call->expect_term_by); set(call->ack_at); + set(call->ack_lost_at); set(call->resend_at); set(call->ping_at); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b305970a9b63..7ee3d6ce5aa2 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -197,6 +197,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) unsigned long j = now + MAX_JIFFY_OFFSET; call->ack_at = j; + call->ack_lost_at = j; call->resend_at = j; call->ping_at = j; call->expect_rx_by = j; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c89647eae86d..23a5e61d8f79 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -630,6 +630,43 @@ found: orig_serial, ack_serial, sent_at, resp_time); } +/* + * Process the response to a ping that we sent to find out if we lost an ACK. + * + * If we got back a ping response that indicates a lower tx_top than what we + * had at the time of the ping transmission, we adjudge all the DATA packets + * sent between the response tx_top and the ping-time tx_top to have been lost. + */ +static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) +{ + rxrpc_seq_t top, bottom, seq; + bool resend = false; + + spin_lock_bh(&call->lock); + + bottom = call->tx_hard_ack + 1; + top = call->acks_lost_top; + if (before(bottom, top)) { + for (seq = bottom; before_eq(seq, top); seq++) { + int ix = seq & RXRPC_RXTX_BUFF_MASK; + u8 annotation = call->rxtx_annotations[ix]; + u8 anno_type = annotation & RXRPC_TX_ANNO_MASK; + + if (anno_type != RXRPC_TX_ANNO_UNACK) + continue; + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = annotation; + resend = true; + } + } + + spin_unlock_bh(&call->lock); + + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); +} + /* * Process a ping response. */ @@ -645,6 +682,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call, smp_rmb(); ping_serial = call->ping_serial; + if (orig_serial == call->acks_lost_ping) + rxrpc_input_check_for_lost_ack(call); + if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || before(orig_serial, ping_serial)) return; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f47659c7b224..efe06edce189 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -95,7 +95,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, /* * Send an ACK call packet. */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) +int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, + rxrpc_serial_t *_serial) { struct rxrpc_connection *conn = NULL; struct rxrpc_ack_buffer *pkt; @@ -165,6 +166,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); + if (_serial) + *_serial = serial; if (ping) { call->ping_serial = serial; @@ -323,7 +326,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * ACKs if a DATA packet appears to have been lost. */ if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && - (retrans || + (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || + retrans || call->cong_mode == RXRPC_CALL_SLOW_START || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), @@ -370,6 +374,18 @@ done: if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = now; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); + if (call->peer->rtt_usage > 1) { + unsigned long nowj = jiffies, ack_lost_at; + + ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); + if (ack_lost_at < 1) + ack_lost_at = 1; + + ack_lost_at += nowj; + WRITE_ONCE(call->ack_lost_at, ack_lost_at); + rxrpc_reduce_call_timer(call, ack_lost_at, nowj, + rxrpc_timer_set_for_lost_ack); + } } } _leave(" = %d [%u]", ret, call->peer->maxdata); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index fad5f42a3abd..cc21e8db25b0 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -148,7 +148,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, NULL); } #endif @@ -222,7 +222,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) true, true, rxrpc_propose_ack_rotate_rx); if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY) - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, NULL); } } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index c56ee54fdd1f..a1c53ac066a1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -285,7 +285,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, do { /* Check to see if there's a ping ACK to reply to. */ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, NULL); if (!skb) { size_t size, chunk, max, space; -- cgit v1.2.3 From 415f44e43282a16ec0808c7ccfd401762e587437 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:42 +0000 Subject: rxrpc: Add keepalive for a call We need to transmit a packet every so often to act as a keepalive for the peer (which has a timeout from the last time it received a packet) and also to prevent any intervening firewalls from closing the route. Do this by resetting a timer every time we transmit a packet. If the timer ever expires, we transmit a PING ACK packet and thereby also elicit a PING RESPONSE ACK from the other side - which prevents our last-rx timeout from expiring. The timer is set to 1/6 of the last-rx timeout so that we can detect the other side going away if it misses 6 replies in a row. This is particularly necessary for servers where the processing of the service function may take a significant amount of time. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 10 ++++++++++ net/rxrpc/output.c | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7e7b817c69f0..cdcbc798f921 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -519,6 +519,7 @@ struct rxrpc_call { unsigned long ack_lost_at; /* When ACK is figured as lost */ unsigned long resend_at; /* When next resend needs to happen */ unsigned long ping_at; /* When next to send a ping */ + unsigned long keepalive_at; /* When next to send a keepalive ping */ unsigned long expect_rx_by; /* When we expect to get a packet by */ unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ unsigned long expect_term_by; /* When we expect call termination by */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c65666b2f39e..bda952ffe6a6 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -366,6 +366,15 @@ recheck_state: set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); } + t = READ_ONCE(call->keepalive_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); + cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, true, + rxrpc_propose_ack_ping_for_keepalive); + set_bit(RXRPC_CALL_EV_PING, &call->events); + } + t = READ_ONCE(call->ping_at); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); @@ -423,6 +432,7 @@ recheck_state: set(call->ack_at); set(call->ack_lost_at); set(call->resend_at); + set(call->keepalive_at); set(call->ping_at); now = jiffies; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index efe06edce189..42410e910aff 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -32,6 +32,24 @@ struct rxrpc_abort_buffer { __be32 abort_code; }; +/* + * Arrange for a keepalive ping a certain time after we last transmitted. This + * lets the far side know we're still interested in this call and helps keep + * the route through any intervening firewall open. + * + * Receiving a response to the ping will prevent the ->expect_rx_by timer from + * expiring. + */ +static void rxrpc_set_keepalive(struct rxrpc_call *call) +{ + unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; + + keepalive_at += now; + WRITE_ONCE(call->keepalive_at, keepalive_at); + rxrpc_reduce_call_timer(call, keepalive_at, now, + rxrpc_timer_set_for_keepalive); +} + /* * Fill out an ACK packet. */ @@ -205,6 +223,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, call->ackr_seen = top; spin_unlock_bh(&call->lock); } + + rxrpc_set_keepalive(call); } out: @@ -388,6 +408,9 @@ done: } } } + + rxrpc_set_keepalive(call); + _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; -- cgit v1.2.3 From f859ab61875978eeaa539740ff7f7d91f5d60006 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:42 +0000 Subject: rxrpc: Fix service endpoint expiry RxRPC service endpoints expire like they're supposed to by the following means: (1) Mark dead rxrpc_net structs (with ->live) rather than twiddling the global service conn timeout, otherwise the first rxrpc_net struct to die will cause connections on all others to expire immediately from then on. (2) Mark local service endpoints for which the socket has been closed (->service_closed) so that the expiration timeout can be much shortened for service and client connections going through that endpoint. (3) rxrpc_put_service_conn() needs to schedule the reaper when the usage count reaches 1, not 0, as idle conns have a 1 count. (4) The accumulator for the earliest time we might want to schedule for should be initialised to jiffies + MAX_JIFFY_OFFSET, not ULONG_MAX as the comparison functions use signed arithmetic. (5) Simplify the expiration handling, adding the expiration value to the idle timestamp each time rather than keeping track of the time in the past before which the idle timestamp must go to be expired. This is much easier to read. (6) Ignore the timeouts if the net namespace is dead. (7) Restart the service reaper work item rather the client reaper. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 13 +++++++++++++ net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/conn_client.c | 2 ++ net/rxrpc/conn_object.c | 42 ++++++++++++++++++++++++------------------ net/rxrpc/net_ns.c | 3 +++ 5 files changed, 45 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index c0cdcf980ffc..abb524c2b8f8 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -867,6 +867,19 @@ static int rxrpc_release_sock(struct sock *sk) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; + /* We want to kill off all connections from a service socket + * as fast as possible because we can't share these; client + * sockets, on the other hand, can share an endpoint. + */ + switch (sk->sk_state) { + case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_BOUND2: + case RXRPC_SERVER_LISTENING: + case RXRPC_SERVER_LISTEN_DISABLED: + rx->local->service_closed = true; + break; + } + spin_lock_bh(&sk->sk_receive_queue.lock); sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cdcbc798f921..a0082c407005 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -84,6 +84,7 @@ struct rxrpc_net { unsigned int nr_client_conns; unsigned int nr_active_client_conns; bool kill_all_client_conns; + bool live; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ struct list_head waiting_client_conns; @@ -265,6 +266,7 @@ struct rxrpc_local { rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; + bool service_closed; /* Service socket closed */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -881,6 +883,7 @@ void rxrpc_process_connection(struct work_struct *); * conn_object.c */ extern unsigned int rxrpc_connection_expiry; +extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 97f6a8de4845..785dfdb9fef1 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -1079,6 +1079,8 @@ next: expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; + if (conn->params.local->service_closed) + expiry = rxrpc_closed_conn_expiry * HZ; conn_expires_at = conn->idle_timestamp + expiry; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index a01a3791f06a..726eda6140ae 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -20,7 +20,8 @@ /* * Time till a connection expires after last use (in seconds). */ -unsigned int rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_closed_conn_expiry = 10; static void rxrpc_destroy_connection(struct rcu_head *); @@ -321,7 +322,7 @@ void rxrpc_put_service_conn(struct rxrpc_connection *conn) n = atomic_dec_return(&conn->usage); trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); - if (n == 0) { + if (n == 1) { rxnet = conn->params.local->rxnet; rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0); } @@ -363,15 +364,14 @@ void rxrpc_service_connection_reaper(struct work_struct *work) struct rxrpc_net *rxnet = container_of(to_delayed_work(work), struct rxrpc_net, service_conn_reaper); - unsigned long reap_older_than, earliest, idle_timestamp, now; + unsigned long expire_at, earliest, idle_timestamp, now; LIST_HEAD(graveyard); _enter(""); now = jiffies; - reap_older_than = now - rxrpc_connection_expiry * HZ; - earliest = ULONG_MAX; + earliest = now + MAX_JIFFY_OFFSET; write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { @@ -381,15 +381,21 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; - idle_timestamp = READ_ONCE(conn->idle_timestamp); - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, atomic_read(&conn->usage), - (long)reap_older_than - (long)idle_timestamp); - - if (time_after(idle_timestamp, reap_older_than)) { - if (time_before(idle_timestamp, earliest)) - earliest = idle_timestamp; - continue; + if (rxnet->live) { + idle_timestamp = READ_ONCE(conn->idle_timestamp); + expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; + if (conn->params.local->service_closed) + expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; + + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long)expire_at - (long)now); + + if (time_before(now, expire_at)) { + if (time_before(expire_at, earliest)) + earliest = expire_at; + continue; + } } /* The usage count sits at 1 whilst the object is unused on the @@ -397,6 +403,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; + trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0); if (rxrpc_conn_is_client(conn)) BUG(); @@ -407,10 +414,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work) } write_unlock(&rxnet->conn_lock); - if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); + if (earliest != now + MAX_JIFFY_OFFSET) { + _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, + rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, earliest - now); } @@ -439,7 +446,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) rxrpc_destroy_all_client_connections(rxnet); - rxrpc_connection_expiry = 0; cancel_delayed_work(&rxnet->client_conn_reaper); rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0); flush_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 7edceb8522f5..684c51d600c7 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -22,6 +22,7 @@ static __net_init int rxrpc_init_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); int ret; + rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); rxnet->epoch |= RXRPC_RANDOM_EPOCH; @@ -60,6 +61,7 @@ static __net_init int rxrpc_init_net(struct net *net) return 0; err_proc: + rxnet->live = false; return ret; } @@ -70,6 +72,7 @@ static __net_exit void rxrpc_exit_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); + rxnet->live = false; rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_locals(rxnet); -- cgit v1.2.3 From 3d18cbb7fd0cfdf0b2ca18139950a4b0c1a0a220 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:42 +0000 Subject: rxrpc: Fix conn expiry timers Fix the rxrpc connection expiry timers so that connections for closed AF_RXRPC sockets get deleted in a more timely fashion, freeing up the transport UDP port much more quickly. (1) Replace the delayed work items with work items plus timers so that timer_reduce() can be used to shorten them and so that the timer doesn't requeue the work item if the net namespace is dead. (2) Don't use queue_delayed_work() as that won't alter the timeout if the timer is already running. (3) Don't rearm the timers if the network namespace is dead. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 2 ++ net/rxrpc/ar-internal.h | 6 ++++-- net/rxrpc/conn_client.c | 30 +++++++++++++++++++----------- net/rxrpc/conn_object.c | 28 +++++++++++++++++----------- net/rxrpc/net_ns.c | 30 ++++++++++++++++++++++++++---- 5 files changed, 68 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index abb524c2b8f8..8f7cf4c042be 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -895,6 +895,8 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); + rxrpc_queue_work(&rx->local->rxnet->service_conn_reaper); + rxrpc_queue_work(&rx->local->rxnet->client_conn_reaper); rxrpc_put_local(rx->local); rx->local = NULL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a0082c407005..416688381eb7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -79,7 +79,8 @@ struct rxrpc_net { struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ - struct delayed_work service_conn_reaper; + struct work_struct service_conn_reaper; + struct timer_list service_conn_reap_timer; unsigned int nr_client_conns; unsigned int nr_active_client_conns; @@ -90,7 +91,8 @@ struct rxrpc_net { struct list_head waiting_client_conns; struct list_head active_client_conns; struct list_head idle_client_conns; - struct delayed_work client_conn_reaper; + struct work_struct client_conn_reaper; + struct timer_list client_conn_reap_timer; struct list_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 785dfdb9fef1..7f74ca3059f8 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -691,7 +691,7 @@ int rxrpc_connect_call(struct rxrpc_call *call, _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper.work); + rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); rxrpc_cull_active_client_conns(rxnet); ret = rxrpc_get_client_conn(call, cp, srx, gfp); @@ -756,6 +756,18 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) } } +/* + * Set the reap timer. + */ +static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet) +{ + unsigned long now = jiffies; + unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; + + if (rxnet->live) + timer_reduce(&rxnet->client_conn_reap_timer, reap_at); +} + /* * Disconnect a client call. */ @@ -896,9 +908,7 @@ idle_connection: list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); if (rxnet->idle_client_conns.next == &conn->cache_link && !rxnet->kill_all_client_conns) - queue_delayed_work(rxrpc_workqueue, - &rxnet->client_conn_reaper, - rxrpc_conn_idle_client_expiry); + rxrpc_set_client_reap_timer(rxnet); } else { trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; @@ -1036,8 +1046,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work) { struct rxrpc_connection *conn; struct rxrpc_net *rxnet = - container_of(to_delayed_work(work), - struct rxrpc_net, client_conn_reaper); + container_of(work, struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; bool did_discard = false; @@ -1116,9 +1125,8 @@ not_yet_expired: */ _debug("not yet"); if (!rxnet->kill_all_client_conns) - queue_delayed_work(rxrpc_workqueue, - &rxnet->client_conn_reaper, - conn_expires_at - now); + timer_reduce(&rxnet->client_conn_reap_timer, + conn_expires_at); out: spin_unlock(&rxnet->client_conn_cache_lock); @@ -1138,9 +1146,9 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) rxnet->kill_all_client_conns = true; spin_unlock(&rxnet->client_conn_cache_lock); - cancel_delayed_work(&rxnet->client_conn_reaper); + del_timer_sync(&rxnet->client_conn_reap_timer); - if (!queue_delayed_work(rxrpc_workqueue, &rxnet->client_conn_reaper, 0)) + if (!rxrpc_queue_work(&rxnet->client_conn_reaper)) _debug("destroy: queue failed"); _leave(""); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 726eda6140ae..1aad04a32d5e 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -310,22 +310,30 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) return conn; } +/* + * Set the service connection reap timer. + */ +static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, + unsigned long reap_at) +{ + if (rxnet->live) + timer_reduce(&rxnet->service_conn_reap_timer, reap_at); +} + /* * Release a service connection */ void rxrpc_put_service_conn(struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet; const void *here = __builtin_return_address(0); int n; n = atomic_dec_return(&conn->usage); trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); - if (n == 1) { - rxnet = conn->params.local->rxnet; - rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0); - } + if (n == 1) + rxrpc_set_service_reap_timer(conn->params.local->rxnet, + jiffies + rxrpc_connection_expiry); } /* @@ -362,8 +370,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; struct rxrpc_net *rxnet = - container_of(to_delayed_work(work), - struct rxrpc_net, service_conn_reaper); + container_of(work, struct rxrpc_net, service_conn_reaper); unsigned long expire_at, earliest, idle_timestamp, now; LIST_HEAD(graveyard); @@ -417,8 +424,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (earliest != now + MAX_JIFFY_OFFSET) { _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, - earliest - now); + rxrpc_set_service_reap_timer(rxnet, earliest); } while (!list_empty(&graveyard)) { @@ -446,8 +452,8 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) rxrpc_destroy_all_client_connections(rxnet); - cancel_delayed_work(&rxnet->client_conn_reaper); - rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0); + del_timer_sync(&rxnet->service_conn_reap_timer); + rxrpc_queue_work(&rxnet->service_conn_reaper); flush_workqueue(rxrpc_workqueue); write_lock(&rxnet->conn_lock); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 684c51d600c7..f18c9248e0d4 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -14,6 +14,24 @@ unsigned int rxrpc_net_id; +static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, client_conn_reap_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->client_conn_reaper); +} + +static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, service_conn_reap_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->service_conn_reaper); +} + /* * Initialise a per-network namespace record. */ @@ -32,8 +50,10 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); - INIT_DELAYED_WORK(&rxnet->service_conn_reaper, - rxrpc_service_connection_reaper); + INIT_WORK(&rxnet->service_conn_reaper, + rxrpc_service_connection_reaper); + timer_setup(&rxnet->service_conn_reap_timer, + rxrpc_service_conn_reap_timeout, 0); rxnet->nr_client_conns = 0; rxnet->nr_active_client_conns = 0; @@ -43,8 +63,10 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->waiting_client_conns); INIT_LIST_HEAD(&rxnet->active_client_conns); INIT_LIST_HEAD(&rxnet->idle_client_conns); - INIT_DELAYED_WORK(&rxnet->client_conn_reaper, - rxrpc_discard_expired_client_conns); + INIT_WORK(&rxnet->client_conn_reaper, + rxrpc_discard_expired_client_conns); + timer_setup(&rxnet->client_conn_reap_timer, + rxrpc_client_conn_reap_timeout, 0); INIT_LIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); -- cgit v1.2.3 From 01a95b2141e337dea15ad48e60a35c72a9b10205 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Nov 2017 09:35:25 +0100 Subject: cfg80211: select CRYPTO_SHA256 if needed When regulatory database certificates are built-in, they're currently using the SHA256 digest algorithm, so add that to the build in that case. Also add a note that for custom certificates, one may need to add the right algorithms. Reported-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: Johannes Berg --- net/wireless/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index da91bb547db3..1abcc4fc4df1 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -20,6 +20,10 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + # may need to update this when certificates are changed and are + # using a different algorithm, though right now they shouldn't + # (this is here rather than below to allow it to be a module) + select CRYPTO_SHA256 if CFG80211_USE_KERNEL_REGDB_KEYS ---help--- cfg80211 is the Linux wireless LAN (802.11) configuration API. Enable this if you have a wireless device. @@ -113,6 +117,9 @@ config CFG80211_EXTRA_REGDB_KEYDIR certificates like in the kernel sources (net/wireless/certs/) that shall be accepted for a signed regulatory database. + Note that you need to also select the correct CRYPTO_ modules + for your certificates, and if cfg80211 is built-in they also must be. + config CFG80211_REG_CELLULAR_HINTS bool "cfg80211 regulatory support for cellular base station hints" depends on CFG80211_CERTIFICATION_ONUS -- cgit v1.2.3 From a60b3f515d30d0fe8537c64671926879a3548103 Mon Sep 17 00:00:00 2001 From: Roman Kapl Date: Fri, 24 Nov 2017 12:27:58 +0100 Subject: net: sched: crash on blocks with goto chain action tcf_block_put_ext has assumed that all filters (and thus their goto actions) are destroyed in RCU callback and thus can not race with our list iteration. However, that is not true during netns cleanup (see tcf_exts_get_net comment). Prevent the user after free by holding all chains (except 0, that one is already held). foreach_safe is not enough in this case. To reproduce, run the following in a netns and then delete the ns: ip link add dtest type dummy tc qdisc add dev dtest ingress tc filter add dev dtest chain 1 parent ffff: handle 1 prio 1 flower action goto chain 2 Fixes: 822e86d997 ("net_sched: remove tcf_block_put_deferred()") Signed-off-by: Roman Kapl Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7d97f612c9b9..ddcf04b4ab43 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -336,7 +336,8 @@ static void tcf_block_put_final(struct work_struct *work) struct tcf_chain *chain, *tmp; rtnl_lock(); - /* Only chain 0 should be still here. */ + + /* At this point, all the chains should have refcnt == 1. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); rtnl_unlock(); @@ -344,15 +345,21 @@ static void tcf_block_put_final(struct work_struct *work) } /* XXX: Standalone actions are not allowed to jump to any chain, and bound - * actions should be all removed after flushing. However, filters are now - * destroyed in tc filter workqueue with RTNL lock, they can not race here. + * actions should be all removed after flushing. */ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { - struct tcf_chain *chain, *tmp; + struct tcf_chain *chain; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + /* Hold a refcnt for all chains, except 0, so that they don't disappear + * while we are iterating. + */ + list_for_each_entry(chain, &block->chain_list, list) + if (chain->index) + tcf_chain_hold(chain); + + list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); tcf_block_offload_unbind(block, q, ei); -- cgit v1.2.3 From afbea2cd253b5198350dfd8edb963567d05827d6 Mon Sep 17 00:00:00 2001 From: Jorgen Hansen Date: Fri, 24 Nov 2017 06:25:28 -0800 Subject: VSOCK: Don't call vsock_stream_has_data in atomic context When using the host personality, VMCI will grab a mutex for any queue pair access. In the detach callback for the vmci vsock transport, we call vsock_stream_has_data while holding a spinlock, and vsock_stream_has_data will access a queue pair. To avoid this, we can simply omit calling vsock_stream_has_data for host side queue pairs, since the QPs are empty per default when the guest has detached. This bug affects users of VMware Workstation using kernel version 4.4 and later. Testing: Ran vsock tests between guest and host, and verified that with this change, the host isn't calling vsock_stream_has_data during detach. Ran mixedTest between guest and host using both guest and host as server. v2: Rebased on top of recent change to sk_state values Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Reviewed-by: Stefan Hajnoczi Signed-off-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 391775e3575c..56573dc85709 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -797,9 +797,13 @@ static void vmci_transport_handle_detach(struct sock *sk) /* We should not be sending anymore since the peer won't be * there to receive, but we can still receive if there is data - * left in our consume queue. + * left in our consume queue. If the local endpoint is a host, + * we can't call vsock_stream_has_data, since that may block, + * but a host endpoint can't read data once the VM has + * detached, so there is no available data in that case. */ - if (vsock_stream_has_data(vsk) <= 0) { + if (vsk->local_addr.svm_cid == VMADDR_CID_HOST || + vsock_stream_has_data(vsk) <= 0) { sk->sk_state = TCP_CLOSE; if (sk->sk_state == TCP_SYN_SENT) { @@ -2144,7 +2148,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); -MODULE_VERSION("1.0.4.0-k"); +MODULE_VERSION("1.0.5.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); -- cgit v1.2.3 From 9e741045faea17e28663f14a45f7f3304827c968 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 24 Nov 2017 11:36:06 -0500 Subject: net: dsa: fix 'increment on 0' warning Setting the refcount to 0 when allocating a tree to match the number of switch devices it holds may cause an 'increment on 0; use-after-free', if CONFIG_REFCOUNT_FULL is enabled. To fix this, do not decrement the refcount of a newly allocated tree, increment it when an already allocated tree is found, and decrement it after the probing of a switch, as done with the previous behavior. At the same time, make dsa_tree_get and dsa_tree_put accept a NULL argument to simplify callers, and return the tree after incrementation, as most kref users like of_node_get and of_node_put do. Fixes: 8e5bf9759a06 ("net: dsa: simplify tree reference counting") Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 44e3fb7dec8c..1e287420ff49 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -51,9 +51,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_tree_list, &dst->list); - /* Initialize the reference counter to the number of switches, not 1 */ kref_init(&dst->refcount); - refcount_set(&dst->refcount.refcount, 0); return dst; } @@ -64,20 +62,23 @@ static void dsa_tree_free(struct dsa_switch_tree *dst) kfree(dst); } -static struct dsa_switch_tree *dsa_tree_touch(int index) +static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst) { - struct dsa_switch_tree *dst; - - dst = dsa_tree_find(index); - if (!dst) - dst = dsa_tree_alloc(index); + if (dst) + kref_get(&dst->refcount); return dst; } -static void dsa_tree_get(struct dsa_switch_tree *dst) +static struct dsa_switch_tree *dsa_tree_touch(int index) { - kref_get(&dst->refcount); + struct dsa_switch_tree *dst; + + dst = dsa_tree_find(index); + if (dst) + return dsa_tree_get(dst); + else + return dsa_tree_alloc(index); } static void dsa_tree_release(struct kref *ref) @@ -91,7 +92,8 @@ static void dsa_tree_release(struct kref *ref) static void dsa_tree_put(struct dsa_switch_tree *dst) { - kref_put(&dst->refcount, dsa_tree_release); + if (dst) + kref_put(&dst->refcount, dsa_tree_release); } static bool dsa_port_is_dsa(struct dsa_port *port) @@ -765,6 +767,7 @@ int dsa_register_switch(struct dsa_switch *ds) mutex_lock(&dsa2_mutex); err = dsa_switch_probe(ds); + dsa_tree_put(ds->dst); mutex_unlock(&dsa2_mutex); return err; -- cgit v1.2.3 From 2734166e89639c973c6e125ac8bcfc2d9db72b70 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 25 Nov 2017 13:14:40 -0600 Subject: net: openvswitch: datapath: fix data type in queue_gso_packets gso_type is being used in binary AND operations together with SKB_GSO_UDP. The issue is that variable gso_type is of type unsigned short and SKB_GSO_UDP expands to more than 16 bits: SKB_GSO_UDP = 1 << 16 this makes any binary AND operation between gso_type and SKB_GSO_UDP to be always zero, hence making some code unreachable and likely causing undesired behavior. Fix this by changing the data type of variable gso_type to unsigned int. Addresses-Coverity-ID: 1462223 Fixes: 0c19f846d582 ("net: accept UFO datagrams from tuntap and packet") Signed-off-by: Gustavo A. R. Silva Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 99cfafc2a139..ef38e5aecd28 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,7 +308,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { - unsigned short gso_type = skb_shinfo(skb)->gso_type; + unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; -- cgit v1.2.3 From 67c8d22a73128ff910e2287567132530abcf5b71 Mon Sep 17 00:00:00 2001 From: zhangliping Date: Sat, 25 Nov 2017 22:02:12 +0800 Subject: openvswitch: fix the incorrect flow action alloc size If we want to add a datapath flow, which has more than 500 vxlan outputs' action, we will get the following error reports: openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Actions may not be safe on all matching packets ... ... It seems that we can simply enlarge the MAX_ACTIONS_BUFSIZE to fix it, but this is not the root cause. For example, for a vxlan output action, we need about 60 bytes for the nlattr, but after it is converted to the flow action, it only occupies 24 bytes. This means that we can still support more than 1000 vxlan output actions for a single datapath flow under the the current 32k max limitation. So even if the nla_len(attr) is larger than MAX_ACTIONS_BUFSIZE, we shouldn't report EINVAL and keep it move on, as the judgement can be done by the reserve_sfa_size. Signed-off-by: zhangliping Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dc424798ba6f..624ea74353dd 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2241,14 +2241,11 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) #define MAX_ACTIONS_BUFSIZE (32 * 1024) -static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); - return ERR_PTR(-EINVAL); - } + WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -2321,12 +2318,15 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = ksize(*sfa) * 2; if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + OVS_NLERR(log, "Flow action size exceeds max %u", + MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); + } new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = nla_alloc_flow_actions(new_acts_size, log); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts; @@ -3059,7 +3059,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, { int err; - *sfa = nla_alloc_flow_actions(nla_len(attr), log); + *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); -- cgit v1.2.3 From 7b6ddeaf27eca72795ceeae2f0f347db1b5f9a30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 21 Nov 2017 14:46:08 +0100 Subject: mac80211: use QoS NDP for AP probing When connected to a QoS/WMM AP, mac80211 should use a QoS NDP for probing it, instead of a regular non-QoS one, fix this. Change all the drivers to *not* allow QoS NDP for now, even though it looks like most of them should be OK with that. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- net/mac80211/tx.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 04460440d731..c244691deab9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -895,7 +895,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif); + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true); if (!skb) return; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7b8154474b9e..3160954fc406 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4438,13 +4438,15 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, EXPORT_SYMBOL(ieee80211_pspoll_get); struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + bool qos_ok) { struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_managed *ifmgd; struct ieee80211_local *local; struct sk_buff *skb; + bool qos = false; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; @@ -4453,7 +4455,17 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, ifmgd = &sdata->u.mgd; local = sdata->local; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (qos_ok) { + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get(sdata, ifmgd->bssid); + qos = sta && sta->sta.wme; + rcu_read_unlock(); + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*nullfunc) + 2); if (!skb) return NULL; @@ -4463,6 +4475,19 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); + if (qos) { + __le16 qos = cpu_to_le16(7); + + BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | + IEEE80211_STYPE_NULLFUNC) != + IEEE80211_STYPE_QOS_NULLFUNC); + nullfunc->frame_control |= + cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); + skb->priority = 7; + skb_set_queue_mapping(skb, IEEE80211_AC_VO); + skb_put_data(skb, &qos, sizeof(qos)); + } + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); -- cgit v1.2.3 From fbbdad5edf0bb59786a51b94a9d006bc8c2da9a2 Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Tue, 14 Nov 2017 23:20:05 +0800 Subject: mac80211: fix the update of path metric for RANN frame The previous path metric update from RANN frame has not considered the own link metric toward the transmitting mesh STA. Fix this. Reported-by: Michael65535 Signed-off-by: Chun-Yeow Yeoh Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 4f7826d7b47c..4394463a0c2e 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -797,7 +797,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, struct mesh_path *mpath; u8 ttl, flags, hopcount; const u8 *orig_addr; - u32 orig_sn, metric, metric_txsta, interval; + u32 orig_sn, new_metric, orig_metric, last_hop_metric, interval; bool root_is_gate; ttl = rann->rann_ttl; @@ -808,7 +808,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, interval = le32_to_cpu(rann->rann_interval); hopcount = rann->rann_hopcount; hopcount++; - metric = le32_to_cpu(rann->rann_metric); + orig_metric = le32_to_cpu(rann->rann_metric); /* Ignore our own RANNs */ if (ether_addr_equal(orig_addr, sdata->vif.addr)) @@ -825,7 +825,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, return; } - metric_txsta = airtime_link_metric_get(local, sta); + last_hop_metric = airtime_link_metric_get(local, sta); + new_metric = orig_metric + last_hop_metric; + if (new_metric < orig_metric) + new_metric = MAX_METRIC; mpath = mesh_path_lookup(sdata, orig_addr); if (!mpath) { @@ -838,7 +841,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } if (!(SN_LT(mpath->sn, orig_sn)) && - !(mpath->sn == orig_sn && metric < mpath->rann_metric)) { + !(mpath->sn == orig_sn && new_metric < mpath->rann_metric)) { rcu_read_unlock(); return; } @@ -856,7 +859,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } mpath->sn = orig_sn; - mpath->rann_metric = metric + metric_txsta; + mpath->rann_metric = new_metric; mpath->is_root = true; /* Recording RANNs sender address to send individually * addressed PREQs destined for root mesh STA */ @@ -876,7 +879,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr, orig_sn, 0, NULL, 0, broadcast_addr, hopcount, ttl, interval, - metric + metric_txsta, 0, sdata); + new_metric, 0, sdata); } rcu_read_unlock(); -- cgit v1.2.3 From 72e2c3438ba3bd2ed640b6b5ea9e58993dd9ab7f Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Sun, 29 Oct 2017 11:51:09 +0200 Subject: mac80211: tear down RX aggregations first When doing HW restart we tear down aggregations. Since at this point we are not TX'ing any aggregation, while the peer is still sending RX aggregation over the air, it will make sense to tear down the RX aggregations first. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 41f5e48f8021..167f83b853e6 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -292,7 +292,6 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, mutex_lock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - ___ieee80211_stop_tx_ba_session(sta, i, reason); ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && @@ -300,6 +299,9 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, } mutex_unlock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + ___ieee80211_stop_tx_ba_session(sta, i, reason); + /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); -- cgit v1.2.3 From 3aa623da789ff6edf789b5655debf33f50b3a9b2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:05:32 +0800 Subject: sctp: use sizeof(__u16) for each stream number length instead of magic number Now in stream reconf part there are still some places using magic number 2 for each stream number length. To make it more readable, this patch is to replace them with sizeof(__u16). Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index a11db21dc8a0..09c797a10aaa 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -563,7 +563,7 @@ struct sctp_chunk *sctp_process_strreset_outreq( flags = SCTP_STREAM_RESET_INCOMING_SSN; } - nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2; + nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); if (nums) { str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { @@ -627,7 +627,7 @@ struct sctp_chunk *sctp_process_strreset_inreq( goto out; } - nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2; + nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { @@ -927,7 +927,8 @@ struct sctp_chunk *sctp_process_strreset_resp( outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; - nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / 2; + nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / + sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { if (nums) { @@ -956,7 +957,8 @@ struct sctp_chunk *sctp_process_strreset_resp( inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; - nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / 2; + nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / + sizeof(__u16); *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); -- cgit v1.2.3 From d570a59c5b5f8fb3b65fa7b15ddb205a1d55f8d0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:05:33 +0800 Subject: sctp: only allow the out stream reset when the stream outq is empty Now the out stream reset in sctp stream reconf could be done even if the stream outq is not empty. It means that users can not be sure since which msg the new ssn will be used. To make this more synchronous, it shouldn't allow to do out stream reset until these chunks in unsent outq all are sent out. This patch checks the corresponding stream outqs when sending and processing the request . If any of them has unsent chunks in outq, it will return -EAGAIN instead or send SCTP_STRRESET_IN_PROGRESS back to the sender. Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter") Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 09c797a10aaa..b20903712d67 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -254,6 +254,30 @@ static int sctp_send_reconf(struct sctp_association *asoc, return retval; } +static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, + __u16 str_nums, __be16 *str_list) +{ + struct sctp_association *asoc; + __u16 i; + + asoc = container_of(stream, struct sctp_association, stream); + if (!asoc->outqueue.out_qlen) + return true; + + if (!str_nums) + return false; + + for (i = 0; i < str_nums; i++) { + __u16 sid = ntohs(str_list[i]); + + if (stream->out[sid].ext && + !list_empty(&stream->out[sid].ext->outq)) + return false; + } + + return true; +} + int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { @@ -317,6 +341,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); + if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { + retval = -EAGAIN; + goto out; + } + chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); @@ -636,6 +665,12 @@ struct sctp_chunk *sctp_process_strreset_inreq( } } + if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { + result = SCTP_STRRESET_IN_PROGRESS; + asoc->strreset_inseq--; + goto err; + } + chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; -- cgit v1.2.3 From 5c6144a0eb5366ae07fc5059301b139338f39bbd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:05:34 +0800 Subject: sctp: only allow the asoc reset when the asoc outq is empty As it says in rfc6525#section5.1.4, before sending the request, C2: The sender has either no outstanding TSNs or considers all outstanding TSNs abandoned. Prior to this patch, it tried to consider all outstanding TSNs abandoned by dropping all chunks in all outqs with sctp_outq_free (even including sacked, retransmit and transmitted queues) when doing this reset, which is too aggressive. To make it work gently, this patch will only allow the asoc reset when the sender has no outstanding TSNs by checking if unsent, transmitted and retransmit are all empty with sctp_outq_is_empty before sending and processing the request. Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index b20903712d67..f3b7d2779c18 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -406,6 +406,9 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) if (asoc->strreset_outstanding) return -EINPROGRESS; + if (!sctp_outq_is_empty(&asoc->outqueue)) + return -EAGAIN; + chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; @@ -728,6 +731,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( } goto err; } + + if (!sctp_outq_is_empty(&asoc->outqueue)) { + result = SCTP_STRRESET_IN_PROGRESS; + goto err; + } + asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) -- cgit v1.2.3 From 159f2a7456c6ae95c1e1a58e8b8ec65ef12d51cf Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:05:35 +0800 Subject: sctp: avoid flushing unsent queue when doing asoc reset Now when doing asoc reset, it cleans up sacked and abandoned queues by calling sctp_outq_free where it also cleans up unsent, retransmit and transmitted queues. It's safe for the sender of response, as these 3 queues are empty at that time. But when the receiver of response is doing the reset, the users may already enqueue some chunks into unsent during the time waiting the response, and these chunks should not be flushed. To void the chunks in it would be removed, it moves the queue into a temp list, then gets it back after sctp_outq_free is done. The patch also fixes some incorrect comments in sctp_process_strreset_tsnreq. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f3b7d2779c18..9dd5bfe3860c 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -747,9 +747,10 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( goto out; } - /* G3: The same processing as though a SACK chunk with no gap report - * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were - * received MUST be performed. + /* G4: The same processing as though a FWD-TSN chunk (as defined in + * [RFC3758]) with all streams affected and a new cumulative TSN + * ACK of the Receiver's Next TSN minus 1 were received MUST be + * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen); @@ -764,10 +765,9 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); - /* G4: The same processing as though a FWD-TSN chunk (as defined in - * [RFC3758]) with all streams affected and a new cumulative TSN - * ACK of the Receiver's Next TSN minus 1 were received MUST be - * performed. + /* G3: The same processing as though a SACK chunk with no gap report + * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were + * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); @@ -1021,6 +1021,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); + LIST_HEAD(temp); sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn); sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); @@ -1029,7 +1030,13 @@ struct sctp_chunk *sctp_process_strreset_resp( SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); + /* Clean up sacked and abandoned queues only. As the + * out_chunk_list may not be empty, splice it to temp, + * then get it back after sctp_outq_free is done. + */ + list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); + list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; -- cgit v1.2.3 From 52a395896a051a3d5c34fba67c324f69ec5e67c6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:05:36 +0800 Subject: sctp: set sender next_tsn for the old result with ctsn_ack_point plus 1 When doing asoc reset, if the sender of the response has already sent some chunk and increased asoc->next_tsn before the duplicate request comes, the response will use the old result with an incorrect sender next_tsn. Better than asoc->next_tsn, asoc->ctsn_ack_point can't be changed after the sender of the response has performed the asoc reset and before the peer has confirmed it, and it's value is still asoc->next_tsn original value minus 1. This patch sets sender next_tsn for the old result with ctsn_ack_point plus 1 when processing the duplicate request, to make sure the sender next_tsn value peer gets will be always right. Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 9dd5bfe3860c..a20145b3a949 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -725,7 +725,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { - next_tsn = asoc->next_tsn; + next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } -- cgit v1.2.3 From 2e724dca7749223204bbae21745c0e3fc932700a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 27 Nov 2017 20:13:39 +0100 Subject: tipc: eliminate access after delete in group_filter_msg() KASAN revealed another access after delete in group.c. This time it found that we read the header of a received message after the buffer has been released. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 12777cac638a..95fec2c057d6 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -497,6 +497,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); + blks = msg_blocks(hdr); deliver = true; ack = false; update = false; @@ -546,7 +547,6 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!update) continue; - blks = msg_blocks(hdr); tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; -- cgit v1.2.3 From d34971a65b72619508f49cd237283e92f1c329d5 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 17 Oct 2017 18:14:23 +0200 Subject: sunrpc: make the function arg as const Make the struct cache_detail *tmpl argument of the function cache_create_net as const as it is only getting passed to kmemup having the argument as const void *. Add const to the prototype too. Signed-off-by: Bhumika Goyal Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 79d55d949d9a..e68943895be4 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1674,7 +1674,7 @@ void cache_unregister_net(struct cache_detail *cd, struct net *net) } EXPORT_SYMBOL_GPL(cache_unregister_net); -struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) +struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; int i; -- cgit v1.2.3 From ee24eac3ebb781c12a654985e33ecaa07f4d0f95 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 17 Oct 2017 18:14:26 +0200 Subject: SUNRPC: make cache_detail structures const Make these const as they are only getting passed to the function cache_create_net having the argument as const. Signed-off-by: Bhumika Goyal Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 4 ++-- net/sunrpc/svcauth_unix.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73165e9ca5bf..5dd4e6c9fef2 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -264,7 +264,7 @@ out: return status; } -static struct cache_detail rsi_cache_template = { +static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", @@ -524,7 +524,7 @@ out: return status; } -static struct cache_detail rsc_cache_template = { +static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index f81eaa8e0888..740b67d5a733 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -569,7 +569,7 @@ static int unix_gid_show(struct seq_file *m, return 0; } -static struct cache_detail unix_gid_cache_template = { +static const struct cache_detail unix_gid_cache_template = { .owner = THIS_MODULE, .hash_size = GID_HASHMAX, .name = "auth.unix.gid", @@ -862,7 +862,7 @@ struct auth_ops svcauth_unix = { .set_client = svcauth_unix_set_client, }; -static struct cache_detail ip_map_cache_template = { +static const struct cache_detail ip_map_cache_template = { .owner = THIS_MODULE, .hash_size = IP_HASHMAX, .name = "auth.unix.ip", -- cgit v1.2.3 From 8b1836c4b64386e9bc580438cae386ed31a43ab9 Mon Sep 17 00:00:00 2001 From: Jay Elliott Date: Wed, 15 Nov 2017 15:01:13 -0800 Subject: netfilter: conntrack: clamp timeouts to INT_MAX When the conntracking code multiplies a timeout by HZ, it can overflow from positive to negative; this causes it to instantly expire. To protect against this the multiplication is done in 64-bit so we can prevent it from exceeding INT_MAX. Signed-off-by: Jay Elliott Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..66d72a8fa87f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1566,9 +1566,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - ct->timeout = nfct_time_stamp + timeout * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = nfct_time_stamp + (u32)timeout; if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; @@ -1768,6 +1770,7 @@ ctnetlink_create_conntrack(struct net *net, int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; + u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1776,7 +1779,10 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = (u32)timeout + nfct_time_stamp; rcu_read_lock(); if (cda[CTA_HELP]) { -- cgit v1.2.3 From 08f46070dde2a835a7a5bfd4c70372c27bff5d88 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Nov 2017 20:16:06 +0800 Subject: sctp: force SCTP_ERROR_INV_STRM with __u32 when calling sctp_chunk_fail This patch is to force SCTP_ERROR_INV_STRM with right type to fit in sctp_chunk_fail to avoid the sparse error. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index a20145b3a949..76ea66be0bbe 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -64,7 +64,7 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, */ /* Mark as failed send. */ - sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; -- cgit v1.2.3 From 1ba896f6f52bfafac6dec4ca583cdd9a073858e8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Nov 2017 20:16:08 +0800 Subject: sctp: remove extern from stream sched Now each stream sched ops is defined in different .c file and added into the global ops in another .c file, it uses extern to make this work. However extern is not good coding style to get them in and even make C=2 reports errors for this. This patch adds sctp_sched_ops_xxx_init for each stream sched ops in their .c file, then get them into the global ops by calling them when initializing sctp module. Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler") Fixes: ac1ed8b82cd6 ("sctp: introduce round robin stream scheduler") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 1 + net/sctp/stream_sched.c | 25 ++++++++++++++++++------- net/sctp/stream_sched_prio.c | 7 ++++++- net/sctp/stream_sched_rr.c | 7 ++++++- 4 files changed, 31 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f5172c21349b..6a38c2503649 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1499,6 +1499,7 @@ static __init int sctp_init(void) INIT_LIST_HEAD(&sctp_address_families); sctp_v4_pf_init(); sctp_v6_pf_init(); + sctp_sched_ops_init(); status = register_pernet_subsys(&sctp_defaults_ops); if (status) diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 0b83ec51e43b..d8c162a4089c 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -119,16 +119,27 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .unsched_all = sctp_sched_fcfs_unsched_all, }; +static void sctp_sched_ops_fcfs_init(void) +{ + sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); +} + /* API to other parts of the stack */ -extern struct sctp_sched_ops sctp_sched_prio; -extern struct sctp_sched_ops sctp_sched_rr; +static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; -static struct sctp_sched_ops *sctp_sched_ops[] = { - &sctp_sched_fcfs, - &sctp_sched_prio, - &sctp_sched_rr, -}; +void sctp_sched_ops_register(enum sctp_sched_type sched, + struct sctp_sched_ops *sched_ops) +{ + sctp_sched_ops[sched] = sched_ops; +} + +void sctp_sched_ops_init(void) +{ + sctp_sched_ops_fcfs_init(); + sctp_sched_ops_prio_init(); + sctp_sched_ops_rr_init(); +} int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 384dbf3c8760..7997d35dd0fd 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -333,7 +333,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) sctp_sched_prio_unsched(soute); } -struct sctp_sched_ops sctp_sched_prio = { +static struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, @@ -345,3 +345,8 @@ struct sctp_sched_ops sctp_sched_prio = { .sched_all = sctp_sched_prio_sched_all, .unsched_all = sctp_sched_prio_unsched_all, }; + +void sctp_sched_ops_prio_init(void) +{ + sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio); +} diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 7612a438c5b9..1155692448f1 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -187,7 +187,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) sctp_sched_rr_unsched(stream, soute); } -struct sctp_sched_ops sctp_sched_rr = { +static struct sctp_sched_ops sctp_sched_rr = { .set = sctp_sched_rr_set, .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, @@ -199,3 +199,8 @@ struct sctp_sched_ops sctp_sched_rr = { .sched_all = sctp_sched_rr_sched_all, .unsched_all = sctp_sched_rr_unsched_all, }; + +void sctp_sched_ops_rr_init(void) +{ + sctp_sched_ops_register(SCTP_SS_RR, &sctp_sched_rr); +} -- cgit v1.2.3 From 57f015f5eccf25fd4a3336fe3cbbee920a8fba6f Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Tue, 28 Nov 2017 10:44:29 -0500 Subject: packet: fix crash in fanout_demux_rollover() syzkaller found a race condition fanout_demux_rollover() while removing a packet socket from a fanout group. po->rollover is read and operated on during packet_rcv_fanout(), via fanout_demux_rollover(), but the pointer is currently cleared before the synchronization in packet_release(). It is safer to delay the cleanup until after synchronize_net() has been called, ensuring all calls to packet_rcv_fanout() for this socket have finished. To further simplify synchronization around the rollover structure, set po->rollover in fanout_add() only if there are no errors. This removes the need for rcu in the struct and in the call to packet_getsockopt(..., PACKET_ROLLOVER_STATS, ...). Crashing stack trace: fanout_demux_rollover+0xb6/0x4d0 net/packet/af_packet.c:1392 packet_rcv_fanout+0x649/0x7c8 net/packet/af_packet.c:1487 dev_queue_xmit_nit+0x835/0xc10 net/core/dev.c:1953 xmit_one net/core/dev.c:2975 [inline] dev_hard_start_xmit+0x16b/0xac0 net/core/dev.c:2995 __dev_queue_xmit+0x17a4/0x2050 net/core/dev.c:3476 dev_queue_xmit+0x17/0x20 net/core/dev.c:3509 neigh_connected_output+0x489/0x720 net/core/neighbour.c:1379 neigh_output include/net/neighbour.h:482 [inline] ip6_finish_output2+0xad1/0x22a0 net/ipv6/ip6_output.c:120 ip6_finish_output+0x2f9/0x920 net/ipv6/ip6_output.c:146 NF_HOOK_COND include/linux/netfilter.h:239 [inline] ip6_output+0x1f4/0x850 net/ipv6/ip6_output.c:163 dst_output include/net/dst.h:459 [inline] NF_HOOK.constprop.35+0xff/0x630 include/linux/netfilter.h:250 mld_sendpack+0x6a8/0xcc0 net/ipv6/mcast.c:1660 mld_send_initial_cr.part.24+0x103/0x150 net/ipv6/mcast.c:2072 mld_send_initial_cr net/ipv6/mcast.c:2056 [inline] ipv6_mc_dad_complete+0x99/0x130 net/ipv6/mcast.c:2079 addrconf_dad_completed+0x595/0x970 net/ipv6/addrconf.c:4039 addrconf_dad_work+0xac9/0x1160 net/ipv6/addrconf.c:3971 process_one_work+0xbf0/0x1bc0 kernel/workqueue.c:2113 worker_thread+0x223/0x1990 kernel/workqueue.c:2247 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:432 Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") Fixes: 509c7a1ecc860 ("packet: avoid panic in packet_getsockopt()") Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 32 ++++++++++---------------------- net/packet/internal.h | 1 - 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 737092ca9b4e..1b7bb9d9865e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1687,7 +1687,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); - po->rollover = rollover; } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { @@ -1745,6 +1744,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { __dev_remove_pack(&po->prot_hook); po->fanout = match; + po->rollover = rollover; + rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); __fanout_link(sk, po); err = 0; @@ -1758,10 +1759,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: - if (err && rollover) { - kfree_rcu(rollover, rcu); - po->rollover = NULL; - } + kfree(rollover); mutex_unlock(&fanout_mutex); return err; } @@ -1785,11 +1783,6 @@ static struct packet_fanout *fanout_release(struct sock *sk) list_del(&f->list); else f = NULL; - - if (po->rollover) { - kfree_rcu(po->rollover, rcu); - po->rollover = NULL; - } } mutex_unlock(&fanout_mutex); @@ -3029,6 +3022,7 @@ static int packet_release(struct socket *sock) synchronize_net(); if (f) { + kfree(po->rollover); fanout_release_data(f); kfree(f); } @@ -3843,7 +3837,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; - struct packet_rollover *rollover; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3922,18 +3915,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_ROLLOVER_STATS: - rcu_read_lock(); - rollover = rcu_dereference(po->rollover); - if (rollover) { - rstats.tp_all = atomic_long_read(&rollover->num); - rstats.tp_huge = atomic_long_read(&rollover->num_huge); - rstats.tp_failed = atomic_long_read(&rollover->num_failed); - data = &rstats; - lv = sizeof(rstats); - } - rcu_read_unlock(); - if (!rollover) + if (!po->rollover) return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; diff --git a/net/packet/internal.h b/net/packet/internal.h index 562fbc155006..a1d2b2319ae9 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -95,7 +95,6 @@ struct packet_fanout { struct packet_rollover { int sock; - struct rcu_head rcu; atomic_long_t num; atomic_long_t num_huge; atomic_long_t num_failed; -- cgit v1.2.3 From 15fe076edea787807a7cdc168df832544b58eba6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Nov 2017 08:03:30 -0800 Subject: net/packet: fix a race in packet_bind() and packet_notifier() syzbot reported crashes [1] and provided a C repro easing bug hunting. When/if packet_do_bind() calls __unregister_prot_hook() and releases po->bind_lock, another thread can run packet_notifier() and process an NETDEV_UP event. This calls register_prot_hook() and hooks again the socket right before first thread is able to grab again po->bind_lock. Fixes this issue by temporarily setting po->num to 0, as suggested by David Miller. [1] dev_remove_pack: ffff8801bf16fa80 not found ------------[ cut here ]------------ kernel BUG at net/core/dev.c:7945! ( BUG_ON(!list_empty(&dev->ptype_all)); ) invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: device syz0 entered promiscuous mode CPU: 0 PID: 3161 Comm: syzkaller404108 Not tainted 4.14.0+ #190 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801cc57a500 task.stack: ffff8801cc588000 RIP: 0010:netdev_run_todo+0x772/0xae0 net/core/dev.c:7945 RSP: 0018:ffff8801cc58f598 EFLAGS: 00010293 RAX: ffff8801cc57a500 RBX: dffffc0000000000 RCX: ffffffff841f75b2 RDX: 0000000000000000 RSI: 1ffff100398b1ede RDI: ffff8801bf1f8810 device syz0 entered promiscuous mode RBP: ffff8801cc58f898 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801bf1f8cd8 R13: ffff8801cc58f870 R14: ffff8801bf1f8780 R15: ffff8801cc58f7f0 FS: 0000000001716880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020b13000 CR3: 0000000005e25000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rtnl_unlock+0xe/0x10 net/core/rtnetlink.c:106 tun_detach drivers/net/tun.c:670 [inline] tun_chr_close+0x49/0x60 drivers/net/tun.c:2845 __fput+0x333/0x7f0 fs/file_table.c:210 ____fput+0x15/0x20 fs/file_table.c:244 task_work_run+0x199/0x270 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x9bb/0x1ae0 kernel/exit.c:865 do_group_exit+0x149/0x400 kernel/exit.c:968 SYSC_exit_group kernel/exit.c:979 [inline] SyS_exit_group+0x1d/0x20 kernel/exit.c:977 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x44ad19 Fixes: 30f7ea1c2b5f ("packet: race condition in packet_bind") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Francesco Ruggeri Signed-off-by: David S. Miller --- net/packet/af_packet.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1b7bb9d9865e..da215e5c1399 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3091,6 +3091,10 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (need_rehook) { if (po->running) { rcu_read_unlock(); + /* prevents packet_notifier() from calling + * register_prot_hook() + */ + po->num = 0; __unregister_prot_hook(sk, true); rcu_read_lock(); dev_curr = po->prot_hook.dev; @@ -3099,6 +3103,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, dev->ifindex); } + BUG_ON(po->running); po->num = proto; po->prot_hook.type = proto; -- cgit v1.2.3 From 25415cec502a1232b19fffc85465882b19a90415 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Nov 2017 11:11:41 -0800 Subject: cls_bpf: don't decrement net's refcount when offload fails When cls_bpf offload was added it seemed like a good idea to call cls_bpf_delete_prog() instead of extending the error handling path, since the software state is fully initialized at that point. This handling of errors without jumping to the end of the function is error prone, as proven by later commit missing that extra call to __cls_bpf_delete_prog(). __cls_bpf_delete_prog() is now expected to be invoked with a reference on exts->net or the field zeroed out. The call on the offload's error patch does not fullfil this requirement, leading to each error stealing a reference on net namespace. Create a function undoing what cls_bpf_set_parms() did and use it from __cls_bpf_delete_prog() and the error path. Fixes: aae2c35ec892 ("cls_bpf: use tcf_exts_get_net() before call_rcu()") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Daniel Borkmann Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index a9f3e317055c..6fe798c2df1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -258,11 +258,8 @@ static int cls_bpf_init(struct tcf_proto *tp) return 0; } -static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) +static void cls_bpf_free_parms(struct cls_bpf_prog *prog) { - tcf_exts_destroy(&prog->exts); - tcf_exts_put_net(&prog->exts); - if (cls_bpf_is_ebpf(prog)) bpf_prog_put(prog->filter); else @@ -270,6 +267,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) kfree(prog->bpf_name); kfree(prog->bpf_ops); +} + +static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) +{ + tcf_exts_destroy(&prog->exts); + tcf_exts_put_net(&prog->exts); + + cls_bpf_free_parms(prog); kfree(prog); } @@ -514,12 +519,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); - if (ret) { - if (!oldprog) - idr_remove_ext(&head->handle_idr, prog->handle); - __cls_bpf_delete_prog(prog); - return ret; - } + if (ret) + goto errout_parms; if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; @@ -537,6 +538,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_parms: + cls_bpf_free_parms(prog); errout_idr: if (!oldprog) idr_remove_ext(&head->handle_idr, prog->handle); -- cgit v1.2.3 From f85729d07cd649bf69820f14bdad36326a24a699 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Nov 2017 14:28:39 +0100 Subject: sch_sfq: fix null pointer dereference at timer expiration While converting sch_sfq to use timer_setup(), the commit cdeabbb88134 ("net: sched: Convert timers to use timer_setup()") forgot to initialize the 'sch' field. As a result, the timer callback tries to dereference a NULL pointer, and the kernel does oops. Fix it initializing such field at qdisc creation time. Fixes: cdeabbb88134 ("net: sched: Convert timers to use timer_setup()") Signed-off-by: Paolo Abeni Acked-by: Cong Wang Acked-by: Kees Cook Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 890f4a4564e7..09c1203c1711 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -724,6 +724,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) int i; int err; + q->sch = sch; timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); err = tcf_block_get(&q->block, &q->filter_list, sch); -- cgit v1.2.3 From a8dd397903a6e57157f6265911f7d35681364427 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Nov 2017 20:56:07 +0800 Subject: sctp: use right member as the param of list_for_each_entry Commit d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues when migrating a sock") made a mistake that using 'list' as the param of list_for_each_entry to traverse the retransmit, sacked and abandoned queues, while chunks are using 'transmitted_list' to link into these queues. It could cause NULL dereference panic if there are chunks in any of these queues when peeling off one asoc. So use the chunk member 'transmitted_list' instead in this patch. Fixes: d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues when migrating a sock") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3204a9b29407..014847e25648 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -188,13 +188,13 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, list_for_each_entry(chunk, &t->transmitted, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->retransmit, list) + list_for_each_entry(chunk, &q->retransmit, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->sacked, list) + list_for_each_entry(chunk, &q->sacked, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->abandoned, list) + list_for_each_entry(chunk, &q->abandoned, transmitted_list) cb(chunk); list_for_each_entry(chunk, &q->out_chunk_list, list) -- cgit v1.2.3 From 4a5def7f6a758aef1a0a3b10e981881c1e914f69 Mon Sep 17 00:00:00 2001 From: Jorgen Hansen Date: Mon, 27 Nov 2017 05:29:32 -0800 Subject: VSOCK: Don't set sk_state to TCP_CLOSE before testing it A recent commit (3b4477d2dcf2) converted the sk_state to use TCP constants. In that change, vmci_transport_handle_detach was changed such that sk->sk_state was set to TCP_CLOSE before we test whether it is TCP_SYN_SENT. This change moves the sk_state change back to the original locations in that function. Signed-off-by: Jorgen Hansen Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 56573dc85709..a7a73ffe675b 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -804,8 +804,6 @@ static void vmci_transport_handle_detach(struct sock *sk) */ if (vsk->local_addr.svm_cid == VMADDR_CID_HOST || vsock_stream_has_data(vsk) <= 0) { - sk->sk_state = TCP_CLOSE; - if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., @@ -815,10 +813,12 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } + sk->sk_state = TCP_CLOSE; } sk->sk_state_change(sk); } -- cgit v1.2.3 From d51aae68b142f48232257e96ce317db25445418d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Nov 2017 18:37:21 +0100 Subject: net: sched: cbq: create block for q->link.block q->link.block is not initialized, that leads to EINVAL when one tries to add filter there. So initialize it properly. This can be reproduced by: $ tc qdisc add dev eth0 root handle 1: cbq avpkt 1000 rate 1000Mbit bandwidth 1000Mbit $ tc filter add dev eth0 parent 1: protocol ip prio 100 u32 match ip protocol 0 0x00 flowid 1:1 Reported-by: Jaroslav Aster Reported-by: Ivan Vecera Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Jiri Pirko Acked-by: Eelco Chaudron Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6361be7881f1..525eb3a6d625 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1158,9 +1158,13 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) return -EINVAL; + err = tcf_block_get(&q->link.block, &q->link.filter_list, sch); + if (err) + goto put_rtab; + err = qdisc_class_hash_init(&q->clhash); if (err < 0) - goto put_rtab; + goto put_block; q->link.sibling = &q->link; q->link.common.classid = sch->handle; @@ -1194,6 +1198,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) cbq_addprio(q, &q->link); return 0; +put_block: + tcf_block_put(q->link.block); + put_rtab: qdisc_put_rtab(q->link.R_tab); return err; -- cgit v1.2.3 From 6a53b7593233ab9e4f96873ebacc0f653a55c3e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 27 Nov 2017 11:15:16 -0800 Subject: xfrm: check id proto in validate_tmpl() syzbot reported a kernel warning in xfrm_state_fini(), which indicates that we have entries left in the list net->xfrm.state_all whose proto is zero. And xfrm_id_proto_match() doesn't consider them as a match with IPSEC_PROTO_ANY in this case. Proto with value 0 is probably not a valid value, at least verify_newsa_info() doesn't consider it valid either. This patch fixes it by checking the proto value in validate_tmpl() and rejecting invalid ones, like what iproute2 does in xfrm_xfrmproto_getbyname(). Reported-by: syzbot Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Cong Wang Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 983b0233767b..c2cfcc6fdb34 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1445,6 +1445,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) default: return -EINVAL; } + + switch (ut[i].id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: +#endif + case IPSEC_PROTO_ANY: + break; + default: + return -EINVAL; + } + } return 0; -- cgit v1.2.3 From 3d7682af228fd78dc46bc6bf40e0268ad04521ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Nov 2017 14:25:50 +0000 Subject: rxrpc: Clean up whitespace Clean up some whitespace from rxrpc. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index bda952ffe6a6..555274ddc514 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -426,7 +426,7 @@ recheck_state: next = call->expect_rx_by; #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } - + set(call->expect_req_by); set(call->expect_term_by); set(call->ack_at); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1aad04a32d5e..c628351eb900 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -424,7 +424,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (earliest != now + MAX_JIFFY_OFFSET) { _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_set_service_reap_timer(rxnet, earliest); + rxrpc_set_service_reap_timer(rxnet, earliest); } while (!list_empty(&graveyard)) { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 23a5e61d8f79..6fc61400337f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -976,7 +976,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, rxrpc_reduce_call_timer(call, expect_rx_by, now, rxrpc_timer_set_for_normal); } - + switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb, skew); @@ -1213,7 +1213,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reupgrade; conn->service_id = sp->hdr.serviceId; } - + if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); -- cgit v1.2.3 From 5fc62f6a139a7b06b027bf442cd4205619506f59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Nov 2017 14:40:41 +0000 Subject: rxrpc: Fix ACK generation from the connection event processor Repeat terminal ACKs and now terminal ACKs are now generated from the connection event processor rather from call handling as this allows us to discard client call structures as soon as possible and free up the channel for a follow on call. However, in ACKs so generated, the additional information trailer is malformed because the padding that's meant to be in the middle isn't included in what's transmitted. Fix it so that the 3 bytes of padding are included in the transmission. Further, the trailer is misaligned because of the padding, so assigment to the u16 and u32 fields inside it might cause problems on some arches, so fix this by breaking the padding and the trailer out of the packed struct. (This also deals with potential compiler weirdies where some of the nested structs are packed and some aren't). The symptoms can be seen in wireshark as terminal DUPLICATE or IDLE ACK packets in which the Max MTU, Interface MTU and rwind fields have weird values and the Max Packets field is apparently missing. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- net/rxrpc/conn_event.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9e9a8db1bc9c..4ca11be6be3c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -30,22 +30,18 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; struct msghdr msg; - struct kvec iov; + struct kvec iov[3]; struct { struct rxrpc_wire_header whdr; union { - struct { - __be32 code; - } abort; - struct { - struct rxrpc_ackpacket ack; - u8 padding[3]; - struct rxrpc_ackinfo info; - }; + __be32 abort_code; + struct rxrpc_ackpacket ack; }; } __attribute__((packed)) pkt; + struct rxrpc_ackinfo ack_info; size_t len; - u32 serial, mtu, call_id; + int ioc; + u32 serial, mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -66,6 +62,13 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; + iov[0].iov_base = &pkt; + iov[0].iov_len = sizeof(pkt.whdr); + iov[1].iov_base = &padding; + iov[1].iov_len = 3; + iov[2].iov_base = &ack_info; + iov[2].iov_len = sizeof(ack_info); + pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(conn->proto.cid); pkt.whdr.callNumber = htonl(call_id); @@ -80,8 +83,10 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, len = sizeof(pkt.whdr); switch (chan->last_type) { case RXRPC_PACKET_TYPE_ABORT: - pkt.abort.code = htonl(chan->last_abort); - len += sizeof(pkt.abort); + pkt.abort_code = htonl(chan->last_abort); + iov[0].iov_len += sizeof(pkt.abort_code); + len += sizeof(pkt.abort_code); + ioc = 1; break; case RXRPC_PACKET_TYPE_ACK: @@ -94,13 +99,19 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - pkt.info.rxMTU = htonl(rxrpc_rx_mtu); - pkt.info.maxMTU = htonl(mtu); - pkt.info.rwind = htonl(rxrpc_rx_window_size); - pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + ack_info.rxMTU = htonl(rxrpc_rx_mtu); + ack_info.maxMTU = htonl(mtu); + ack_info.rwind = htonl(rxrpc_rx_window_size); + ack_info.jumbo_max = htonl(rxrpc_rx_jumbo_max); pkt.whdr.flags |= RXRPC_SLOW_START_OK; - len += sizeof(pkt.ack) + sizeof(pkt.info); + padding = 0; + iov[0].iov_len += sizeof(pkt.ack); + len += sizeof(pkt.ack) + 3 + sizeof(ack_info); + ioc = 3; break; + + default: + return; } /* Resync with __rxrpc_disconnect_call() and check that the last call @@ -110,9 +121,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, if (READ_ONCE(chan->last_call) != call_id) return; - iov.iov_base = &pkt; - iov.iov_len = len; - serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); @@ -127,7 +135,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; } - kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len); + kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); _leave(""); return; } -- cgit v1.2.3 From 282ef4729195c8503f7101d574acfb5e7c8a8209 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 28 Nov 2017 11:28:52 -0600 Subject: rxrpc: Fix variable overwrite Values assigned to both variable resend_at and ack_at are overwritten before they can be used. The correct fix here is to add 'now' to the previously computed value in resend_at and ack_at. Addresses-Coverity-ID: 1462262 Addresses-Coverity-ID: 1462263 Addresses-Coverity-ID: 1462264 Fixes: beb8e5e4f38c ("rxrpc: Express protocol timeouts in terms of RTT") Link: https://marc.info/?i=17004.1511808959%40warthog.procyon.org.uk Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 555274ddc514..ad2ab1103189 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,7 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; - ack_at = jiffies + expiry; + ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); rxrpc_reduce_call_timer(call, ack_at, now, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a1c53ac066a1..09f2a3e05221 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -233,7 +233,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, if (resend_at < 1) resend_at = 1; - resend_at = now + rxrpc_resend_timeout; + resend_at += now; WRITE_ONCE(call->resend_at, resend_at); rxrpc_reduce_call_timer(call, resend_at, now, rxrpc_timer_set_for_send); -- cgit v1.2.3 From 4ba161a793d5f43757c35feff258d9f20a082940 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 24 Nov 2017 12:00:24 -0500 Subject: SUNRPC: Allow connect to return EHOSTUNREACH Reported-by: Dmitry Vyukov Signed-off-by: Trond Myklebust Tested-by: Dmitry Vyukov Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 684e356b40e4..c2b2d489b57b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2440,6 +2440,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: /* -- cgit v1.2.3 From 90a6ec85351b31449c2c6b5406b5396ac96f191d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 Nov 2017 16:07:51 -0800 Subject: act_sample: get rid of tcf_sample_cleanup_rcu() Similar to commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu"), TC actions don't need to respect RCU grace period, because it is either just detached from tc filter (standalone case) or it is removed together with tc filter (bound case) in which case RCU grace period is already respected at filter layer. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Reported-by: Eric Dumazet Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Yotam Gigi Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/act_sample.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 8b5abcd2f32f..9438969290a6 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -96,23 +96,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +static void tcf_sample_cleanup(struct tc_action *a, int bind) { - struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rcu_dereference_protected(s->psample_group, 1); + psample_group = rtnl_dereference(s->psample_group); RCU_INIT_POINTER(s->psample_group, NULL); psample_group_put(psample_group); } -static void tcf_sample_cleanup(struct tc_action *a, int bind) -{ - struct tcf_sample *s = to_sample(a); - - call_rcu(&s->rcu, tcf_sample_cleanup_rcu); -} - static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { @@ -264,7 +257,6 @@ static int __init sample_init_module(void) static void __exit sample_cleanup_module(void) { - rcu_barrier(); tcf_unregister_action(&act_sample_ops, &sample_net_ops); } -- cgit v1.2.3 From 3016dad75b48279e579117ee3ed566ba90a3b023 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Nov 2017 17:43:57 -0800 Subject: tcp: remove buggy call to tcp_v6_restore_cb() tcp_v6_send_reset() expects to receive an skb with skb->cb[] layout as used in TCP stack. MD5 lookup uses tcp_v6_iif() and tcp_v6_sdif() and thus TCP_SKB_CB(skb)->header.h6 This patch probably fixes RST packets sent on behalf of a timewait md5 ipv6 socket. Before Florian patch, tcp_v6_restore_cb() was needed before jumping to no_tcp_socket label. Fixes: 271c3b9b7bda ("tcp: honour SO_BINDTODEVICE for TW_RST case too") Signed-off-by: Eric Dumazet Cc: Florian Westphal Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6bb98c93edfe..be11dc13aa70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1590,7 +1590,6 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v6_restore_cb(skb); tcp_v6_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; -- cgit v1.2.3 From f859b4af1c52493ec21173ccc73d0b60029b5b88 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Nov 2017 10:41:14 +0800 Subject: sit: update frag_off info After parsing the sit netlink change info, we forget to update frag_off in ipip6_tunnel_update(). Fix it by assigning frag_off with new value. Reported-by: Jianlin Shi Signed-off-by: Hangbin Liu Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d60ddcb0bfe2..d7dc23c1b2ca 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1098,6 +1098,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; + t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; -- cgit v1.2.3 From eb5b46faa693470681ec7c28cc2436edd1571198 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 30 Nov 2017 07:21:33 -0500 Subject: SUNRPC: Handle ENETDOWN errors Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 5 +++++ net/sunrpc/xprtsock.c | 1 + 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a801da812f86..e2a4184f3c5d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1841,6 +1841,7 @@ call_bind_status(struct rpc_task *task) case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -ENOBUFS: @@ -1917,6 +1918,7 @@ call_connect_status(struct rpc_task *task) /* fall through */ case -ECONNRESET: case -ECONNABORTED: + case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EADDRINUSE: @@ -2022,6 +2024,7 @@ call_transmit_status(struct rpc_task *task) */ case -ECONNREFUSED: case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: @@ -2071,6 +2074,7 @@ call_bc_transmit(struct rpc_task *task) switch (task->tk_status) { case 0: /* Success */ + case -ENETDOWN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: @@ -2139,6 +2143,7 @@ call_status(struct rpc_task *task) task->tk_status = 0; switch(status) { case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c2b2d489b57b..7fa94abfd6b2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2439,6 +2439,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) */ case -ECONNREFUSED: case -ECONNRESET: + case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EADDRINUSE: -- cgit v1.2.3 From e719135881f00c01ca400abb8a5dadaf297a24f9 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 29 Nov 2017 18:23:56 +0100 Subject: xfrm: fix XFRMA_OUTPUT_MARK policy entry This seems to be an obvious typo, NLA_U32 is type of the attribute, not its (minimal) length. Fixes: 077fbac405bf ("net: xfrm: support setting an output mark.") Signed-off-by: Michal Kubecek Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c2cfcc6fdb34..ff58c37469d6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2485,7 +2485,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, + [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { -- cgit v1.2.3 From 4ce3dbe397d7b6b15f272ae757c78c35e9e4b61d Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Tue, 28 Nov 2017 19:55:40 +0200 Subject: xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0) Code path when (encap_type < 0) does not verify the state is valid before progressing. This will result in a crash if, for instance, x->km.state == XFRM_STATE_ACQ. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Signed-off-by: Aviv Heller Signed-off-by: Yevgeny Kliteynik Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 347ab31574d5..da6447389ffb 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,7 +207,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) xfrm_address_t *daddr; struct xfrm_mode *inner_mode; u32 mark = skb->mark; - unsigned int family; + unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; @@ -216,6 +216,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (encap_type < 0) { x = xfrm_input_state(skb); + + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); + goto drop; + } + family = x->outer_mode->afinfo->family; /* An encap_type of -1 indicates async resumption. */ -- cgit v1.2.3 From ddc47e4404b58f03e98345398fb12d38fe291512 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 29 Nov 2017 06:53:55 +0100 Subject: xfrm: Fix stack-out-of-bounds read on socket policy lookup. When we do tunnel or beet mode, we pass saddr and daddr from the template to xfrm_state_find(), this is ok. On transport mode, we pass the addresses from the flowi, assuming that the IP addresses (and address family) don't change during transformation. This assumption is wrong in the IPv4 mapped IPv6 case, packet is IPv4 and template is IPv6. Fix this by catching address family missmatches of the policy and the flow already before we do the lookup. Reported-by: syzbot Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9542975eb2f9..038ec68f6901 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1168,9 +1168,15 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { - bool match = xfrm_selector_match(&pol->selector, fl, family); + bool match; int err = 0; + if (pol->family != family) { + pol = NULL; + goto out; + } + + match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { pol = NULL; -- cgit v1.2.3 From fe77d8257c4d838c5976557ddb87bd789f312412 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:25:02 +0100 Subject: batman-adv: Always initialize fragment header priority The batman-adv unuicast fragment header contains 3 bits for the priority of the packet. These bits will be initialized when the skb->priority contains a value between 256 and 263. But otherwise, the uninitialized bits from the stack will be used. Fixes: c0f25c802b33 ("batman-adv: Include frame priority in fragment header") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a98cf1104a30..ebe6e38934e4 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -499,6 +499,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ if (skb->priority >= 256 && skb->priority <= 263) frag_header.priority = skb->priority - 256; + else + frag_header.priority = 0; ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); -- cgit v1.2.3 From 198a62ddffa4a4ffaeb741f642b7b52f2d91ae9b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:50:42 +0100 Subject: batman-adv: Fix check of retrieved orig_gw in batadv_v_gw_is_eligible The batadv_v_gw_is_eligible function already assumes that orig_node is not NULL. But batadv_gw_node_get may have failed to find the originator. It must therefore be checked whether the batadv_gw_node_get failed and not whether orig_node is NULL to detect this error. Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. V - implement GW selection logic") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 341ceab8338d..e0e2bfcd6b3e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -814,7 +814,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, } orig_gw = batadv_gw_node_get(bat_priv, orig_node); - if (!orig_node) + if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) -- cgit v1.2.3 From d30fc5126efb0c33b7adf5966d3051db2c3d7721 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:34 +0800 Subject: sctp: only update outstanding_bytes for transmitted queue when doing prsctp_prune Now outstanding_bytes is only increased when appending chunks into one packet and sending it at 1st time, while decreased when it is about to move into retransmit queue. It means outstanding_bytes value is already decreased for all chunks in retransmit queue. However sctp_prsctp_prune_sent is a common function to check the chunks in both transmitted and retransmit queue, it decrease outstanding_bytes when moving a chunk into abandoned queue from either of them. It could cause outstanding_bytes underflow, as it also decreases it's value for the chunks in retransmit queue. This patch fixes it by only updating outstanding_bytes for transmitted queue when pruning queues for prsctp prio policy, the same fix is also needed in sctp_check_transmitted. Fixes: 8dbdf1f5b09c ("sctp: implement prsctp PRIO policy") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4db012aa25f7..7029f8b99063 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -377,7 +377,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - if (!chk->tsn_gap_acked) { + if (queue != &asoc->outqueue.retransmit && + !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); @@ -1434,7 +1435,8 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ - if (!tchunk->tsn_gap_acked) { + if (transmitted_queue != &q->retransmit && + !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); -- cgit v1.2.3 From e5f612969c6f965e3bd1158598e0a3b1c4f389b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:35 +0800 Subject: sctp: abandon the whole msg if one part of a fragmented message is abandoned As rfc3758#section-3.1 demands: A3) When a TSN is "abandoned", if it is part of a fragmented message, all other TSN's within that fragmented message MUST be abandoned at the same time. Besides, if it couldn't handle this, the rest frags would never get assembled in peer side. This patch supports it by adding abandoned flag in sctp_datamsg, when one chunk is being abandoned, set chunk->msg->abandoned as well. Next time when checking for abandoned, go checking chunk->msg->abandoned first. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 7 +++++++ net/sctp/outqueue.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 7b261afc47b9..9213805b558d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -53,6 +53,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; + msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } @@ -304,6 +305,9 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (!chunk->asoc->peer.prsctp_capable) return 0; + if (chunk->msg->abandoned) + return 1; + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = @@ -316,6 +320,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } + chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { @@ -324,10 +329,12 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { + chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7029f8b99063..4ab164b5aad0 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -364,10 +364,12 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); @@ -404,10 +406,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; -- cgit v1.2.3 From 779edd7348878a7376c0e3d0f96485c30b5f1b7d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:36 +0800 Subject: sctp: do not abandon the other frags in unsent outq if one msg has outstanding frags Now for the abandoned chunks in unsent outq, it would just free the chunks. Because no tsn is assigned to them yet, there's no need to send fwd tsn to peer, unlike for the abandoned chunks in sent outq. The problem is when parts of the msg have been sent and the other frags are still in unsent outq, if they are abandoned/dropped, the peer would never get this msg reassembled. So these frags in unsent outq can't be dropped if this msg already has outstanding frags. This patch does the check in sctp_chunk_abandoned and sctp_prsctp_prune_unsent. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 4 ++++ net/sctp/outqueue.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 9213805b558d..7f8baa48e7c2 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -308,6 +308,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->msg->abandoned) return 1; + if (!chunk->has_tsn && + !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) + return 0; + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4ab164b5aad0..7d67feeeffc1 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -407,7 +407,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && - (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || + !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; -- cgit v1.2.3 From cfac7f836a715b91f08c851df915d401a4d52783 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Dec 2017 10:06:56 -0800 Subject: tcp/dccp: block bh before arming time_wait timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maciej Żenczykowski reported some panics in tcp_twsk_destructor() that might be caused by the following bug. timewait timer is pinned to the cpu, because we want to transition timwewait refcount from 0 to 4 in one go, once everything has been initialized. At the time commit ed2e92394589 ("tcp/dccp: fix timewait races in timer handling") was merged, TCP was always running from BH habdler. After commit 5413d1babe8f ("net: do not block BH while processing socket backlog") we definitely can run tcp_time_wait() from process context. We need to block BH in the critical section so that the pinned timer has still its purpose. This bug is more likely to happen under stress and when very small RTO are used in datacenter flows. Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Acked-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/dccp/minisocks.c | 6 ++++++ net/ipv4/tcp_minisocks.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index abd07a443219..178bb9833311 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -57,10 +57,16 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e36eff0403f4..b079b619b60c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -310,10 +310,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than -- cgit v1.2.3 From c7799c067c2ae33e348508c8afec354f3257ff25 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Wed, 29 Nov 2017 12:48:42 +0200 Subject: tipc: call tipc_rcv() only if bearer is up in tipc_udp_recv() Remove the second tipc_rcv() call in tipc_udp_recv(). We have just checked that the bearer is not up, and calling tipc_rcv() with a bearer that is not up leads to a TIPC div-by-zero crash in tipc_node_calculate_timer(). The crash is rare in practice, but can happen like this: We're enabling a bearer, but it's not yet up and fully initialized. At the same time we receive a discovery packet, and in tipc_udp_recv() we end up calling tipc_rcv() with the not-yet-initialized bearer, causing later the div-by-zero crash in tipc_node_calculate_timer(). Jon Maloy explains the impact of removing the second tipc_rcv() call: "link setup in the worst case will be delayed until the next arriving discovery messages, 1 sec later, and this is an acceptable delay." As the tipc_rcv() call is removed, just leave the function via the rcu_out label, so that we will kfree_skb(). [ 12.590450] Own node address <1.1.1>, network identity 1 [ 12.668088] divide error: 0000 [#1] SMP [ 12.676952] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.14.2-dirty #1 [ 12.679225] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 [ 12.682095] task: ffff8c2a761edb80 task.stack: ffffa41cc0cac000 [ 12.684087] RIP: 0010:tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] [ 12.686486] RSP: 0018:ffff8c2a7fc838a0 EFLAGS: 00010246 [ 12.688451] RAX: 0000000000000000 RBX: ffff8c2a5b382600 RCX: 0000000000000000 [ 12.691197] RDX: 0000000000000000 RSI: ffff8c2a5b382600 RDI: ffff8c2a5b382600 [ 12.693945] RBP: ffff8c2a7fc838b0 R08: 0000000000000001 R09: 0000000000000001 [ 12.696632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8c2a5d8949d8 [ 12.699491] R13: ffffffff95ede400 R14: 0000000000000000 R15: ffff8c2a5d894800 [ 12.702338] FS: 0000000000000000(0000) GS:ffff8c2a7fc80000(0000) knlGS:0000000000000000 [ 12.705099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 12.706776] CR2: 0000000001bb9440 CR3: 00000000bd009001 CR4: 00000000003606e0 [ 12.708847] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 12.711016] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 12.712627] Call Trace: [ 12.713390] [ 12.714011] tipc_node_check_dest+0x2e8/0x350 [tipc] [ 12.715286] tipc_disc_rcv+0x14d/0x1d0 [tipc] [ 12.716370] tipc_rcv+0x8b0/0xd40 [tipc] [ 12.717396] ? minmax_running_min+0x2f/0x60 [ 12.718248] ? dst_alloc+0x4c/0xa0 [ 12.718964] ? tcp_ack+0xaf1/0x10b0 [ 12.719658] ? tipc_udp_is_known_peer+0xa0/0xa0 [tipc] [ 12.720634] tipc_udp_recv+0x71/0x1d0 [tipc] [ 12.721459] ? dst_alloc+0x4c/0xa0 [ 12.722130] udp_queue_rcv_skb+0x264/0x490 [ 12.722924] __udp4_lib_rcv+0x21e/0x990 [ 12.723670] ? ip_route_input_rcu+0x2dd/0xbf0 [ 12.724442] ? tcp_v4_rcv+0x958/0xa40 [ 12.725039] udp_rcv+0x1a/0x20 [ 12.725587] ip_local_deliver_finish+0x97/0x1d0 [ 12.726323] ip_local_deliver+0xaf/0xc0 [ 12.726959] ? ip_route_input_noref+0x19/0x20 [ 12.727689] ip_rcv_finish+0xdd/0x3b0 [ 12.728307] ip_rcv+0x2ac/0x360 [ 12.728839] __netif_receive_skb_core+0x6fb/0xa90 [ 12.729580] ? udp4_gro_receive+0x1a7/0x2c0 [ 12.730274] __netif_receive_skb+0x1d/0x60 [ 12.730953] ? __netif_receive_skb+0x1d/0x60 [ 12.731637] netif_receive_skb_internal+0x37/0xd0 [ 12.732371] napi_gro_receive+0xc7/0xf0 [ 12.732920] receive_buf+0x3c3/0xd40 [ 12.733441] virtnet_poll+0xb1/0x250 [ 12.733944] net_rx_action+0x23e/0x370 [ 12.734476] __do_softirq+0xc5/0x2f8 [ 12.734922] irq_exit+0xfa/0x100 [ 12.735315] do_IRQ+0x4f/0xd0 [ 12.735680] common_interrupt+0xa2/0xa2 [ 12.736126] [ 12.736416] RIP: 0010:native_safe_halt+0x6/0x10 [ 12.736925] RSP: 0018:ffffa41cc0cafe90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff4d [ 12.737756] RAX: 0000000000000000 RBX: ffff8c2a761edb80 RCX: 0000000000000000 [ 12.738504] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 12.739258] RBP: ffffa41cc0cafe90 R08: 0000014b5b9795e5 R09: ffffa41cc12c7e88 [ 12.740118] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002 [ 12.740964] R13: ffff8c2a761edb80 R14: 0000000000000000 R15: 0000000000000000 [ 12.741831] default_idle+0x2a/0x100 [ 12.742323] arch_cpu_idle+0xf/0x20 [ 12.742796] default_idle_call+0x28/0x40 [ 12.743312] do_idle+0x179/0x1f0 [ 12.743761] cpu_startup_entry+0x1d/0x20 [ 12.744291] start_secondary+0x112/0x120 [ 12.744816] secondary_startup_64+0xa5/0xa5 [ 12.745367] Code: b9 f4 01 00 00 48 89 c2 48 c1 ea 02 48 3d d3 07 00 00 48 0f 47 d1 49 8b 0c 24 48 39 d1 76 07 49 89 14 24 48 89 d1 31 d2 48 89 df <48> f7 f1 89 c6 e8 81 6e ff ff 5b 41 5c 5d c3 66 90 66 2e 0f 1f [ 12.747527] RIP: tipc_node_calculate_timer.isra.12+0x45/0x60 [tipc] RSP: ffff8c2a7fc838a0 [ 12.748555] ---[ end trace 1399ab83390650fd ]--- [ 12.749296] Kernel panic - not syncing: Fatal exception in interrupt [ 12.750123] Kernel Offset: 0x13200000 from 0xffffffff82000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 12.751215] Rebooting in 60 seconds.. Fixes: c9b64d492b1f ("tipc: add replicast peer discovery") Signed-off-by: Tommi Rantala Cc: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ecca64fc6a6f..3deabcab4882 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -371,10 +371,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto rcu_out; } - tipc_rcv(sock_net(sk), skb, b); - rcu_read_unlock(); - return 0; - rcu_out: rcu_read_unlock(); out: -- cgit v1.2.3 From 974a6b20518602310637bd8ac9ad348bf8a864d6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Dec 2017 11:47:56 +0100 Subject: batman-adv: Fix kernel-doc for timer functions The commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()") changed the argument name and type of the timer function but didn't adjust the kernel-doc of these functions. Signed-off-by: Sven Eckelmann Acked-by: Kees Cook Signed-off-by: Simon Wunderlich --- net/batman-adv/tp_meter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 15cd2139381e..ebc4e2241c77 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -482,7 +482,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_sender_timeout - timer that fires in case of packet loss - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars * * If fired it means that there was packet loss. * Switch to Slow Start, set the ss_threshold to half of the current cwnd and @@ -1106,7 +1106,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is * reached without received ack - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { -- cgit v1.2.3 From c501256406fb19c306504ee1fe41a4ea208d4245 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Dec 2017 11:09:53 +0000 Subject: rxrpc: Use correct netns source in rxrpc_release_sock() In rxrpc_release_sock() there may be no rx->local value to access, so we can't unconditionally follow it to the rxrpc network namespace information to poke the connection reapers. Instead, use the socket's namespace pointer to find the namespace. This unfixed code causes the following static checker warning: net/rxrpc/af_rxrpc.c:898 rxrpc_release_sock() error: we previously assumed 'rx->local' could be null (see line 887) Fixes: 3d18cbb7fd0c ("rxrpc: Fix conn expiry timers") Reported-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8f7cf4c042be..dcd818fa837e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -860,6 +860,7 @@ static void rxrpc_sock_destructor(struct sock *sk) static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); @@ -895,8 +896,8 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - rxrpc_queue_work(&rx->local->rxnet->service_conn_reaper); - rxrpc_queue_work(&rx->local->rxnet->client_conn_reaper); + rxrpc_queue_work(&rxnet->service_conn_reaper); + rxrpc_queue_work(&rxnet->client_conn_reaper); rxrpc_put_local(rx->local); rx->local = NULL; -- cgit v1.2.3 From eeea10b83a139451130df1594f26710c8fa390c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Dec 2017 09:32:59 -0800 Subject: tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() James Morris reported kernel stack corruption bug [1] while running the SELinux testsuite, and bisected to a recent commit bffa72cf7f9d ("net: sk_buff rbnode reorg") We believe this commit is fine, but exposes an older bug. SELinux code runs from tcp_filter() and might send an ICMP, expecting IP options to be found in skb->cb[] using regular IPCB placement. We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very similar way we added them for IPv6. [1] [ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet [ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 [ 339.822505] [ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 [ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 [ 339.885060] Call Trace: [ 339.896875] [ 339.908103] dump_stack+0x63/0x87 [ 339.920645] panic+0xe8/0x248 [ 339.932668] ? ip_push_pending_frames+0x33/0x40 [ 339.946328] ? icmp_send+0x525/0x530 [ 339.958861] ? kfree_skbmem+0x60/0x70 [ 339.971431] __stack_chk_fail+0x1b/0x20 [ 339.984049] icmp_send+0x525/0x530 [ 339.996205] ? netlbl_skbuff_err+0x36/0x40 [ 340.008997] ? selinux_netlbl_err+0x11/0x20 [ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 [ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 [ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 [ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 [ 340.074562] ? tcp_filter+0x2c/0x40 [ 340.086400] ? tcp_v4_rcv+0x820/0xa20 [ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 [ 340.111279] ? ip_local_deliver+0x6f/0xe0 [ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 [ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 [ 340.147442] ? ip_rcv+0x27c/0x3c0 [ 340.158668] ? inet_del_offload+0x40/0x40 [ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 [ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 [ 340.195282] ? __netif_receive_skb+0x18/0x60 [ 340.207288] ? process_backlog+0x95/0x140 [ 340.218948] ? net_rx_action+0x26c/0x3b0 [ 340.230416] ? __do_softirq+0xc9/0x26a [ 340.241625] ? do_softirq_own_stack+0x2a/0x40 [ 340.253368] [ 340.262673] ? do_softirq+0x50/0x60 [ 340.273450] ? __local_bh_enable_ip+0x57/0x60 [ 340.285045] ? ip_finish_output2+0x175/0x350 [ 340.296403] ? ip_finish_output+0x127/0x1d0 [ 340.307665] ? nf_hook_slow+0x3c/0xb0 [ 340.318230] ? ip_output+0x72/0xe0 [ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 [ 340.340070] ? ip_local_out+0x35/0x40 [ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 [ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 [ 340.372484] ? __skb_clone+0x2e/0x130 [ 340.382633] ? tcp_transmit_skb+0x558/0xa10 [ 340.393262] ? tcp_connect+0x938/0xad0 [ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 [ 340.414206] ? tcp_v4_connect+0x457/0x4e0 [ 340.424471] ? __inet_stream_connect+0xb3/0x300 [ 340.435195] ? inet_stream_connect+0x3b/0x60 [ 340.445607] ? SYSC_connect+0xd9/0x110 [ 340.455455] ? __audit_syscall_entry+0xaf/0x100 [ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 [ 340.476636] ? __audit_syscall_exit+0x209/0x290 [ 340.487151] ? SyS_connect+0xe/0x10 [ 340.496453] ? do_syscall_64+0x67/0x1b0 [ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet Reported-by: James Morris Tested-by: James Morris Tested-by: Casey Schaufler Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 59 ++++++++++++++++++++++++++++++++++++----------------- net/ipv6/tcp_ipv6.c | 10 +++++---- 2 files changed, 46 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c6bc0c4d19c6..77ea45da0fe9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1591,6 +1591,34 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_filter); +static void tcp_v4_restore_cb(struct sk_buff *skb) +{ + memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, + sizeof(struct inet_skb_parm)); +} + +static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, + const struct tcphdr *th) +{ + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; +} + /* * From tcp_input.c */ @@ -1631,24 +1659,6 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); - /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() - * barrier() makes sure compiler wont play fool^Waliasing games. - */ - memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), - sizeof(struct inet_skb_parm)); - barrier(); - - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff * 4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->has_rxtstamp = - skb->tstamp || skb_hwtstamps(skb)->hwtstamp; - lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); @@ -1679,14 +1689,19 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); + tcp_v4_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1712,6 +1727,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; @@ -1742,6 +1758,8 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1768,6 +1786,8 @@ do_time_wait: goto discard_it; } + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1784,6 +1804,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + tcp_v4_restore_cb(skb); refcounted = false; goto process; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index be11dc13aa70..1f04ec0e4a7a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1454,7 +1454,6 @@ process: struct sock *nsk; sk = req->rsk_listener; - tcp_v6_fill_cb(skb, hdr, th); if (tcp_v6_inbound_md5_hash(sk, skb)) { sk_drops_add(sk, skb); reqsk_put(req); @@ -1467,8 +1466,12 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; @@ -1492,8 +1495,6 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - tcp_v6_fill_cb(skb, hdr, th); - if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; @@ -1501,6 +1502,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; -- cgit v1.2.3 From 4b380c42f7d00a395feede754f0bc2292eebe6e5 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: netfilter: nfnetlink_cthelper: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 41628b393673..d33ce6d5ebce 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -407,6 +408,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -611,6 +615,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -678,6 +685,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); -- cgit v1.2.3 From 6ab405114b0b229151ef06f4e31c7834dd09d0c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Dec 2017 01:46:07 +0100 Subject: netfilter: xt_bpf: add overflow checks Check whether inputs from userspace are too long (explicit length field too big or string not null-terminated) to avoid out-of-bounds reads. As far as I can tell, this can at worst lead to very limited kernel heap memory disclosure or oopses. This bug can be triggered by an unprivileged user even if the xt_bpf module is not loaded: iptables is available in network namespaces, and the xt_bpf module can be autoloaded. Triggering the bug with a classic BPF filter with fake length 0x1000 causes the following KASAN report: ================================================================== BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0 Read of size 32768 at addr ffff8801eff2c494 by task test/4627 CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1 [...] Call Trace: dump_stack+0x5c/0x85 print_address_description+0x6a/0x260 kasan_report+0x254/0x370 ? bpf_prog_create+0x84/0xf0 memcpy+0x1f/0x50 bpf_prog_create+0x84/0xf0 bpf_mt_check+0x90/0xd6 [xt_bpf] [...] Allocated by task 4627: kasan_kmalloc+0xa0/0xd0 __kmalloc_node+0x47/0x60 xt_alloc_table_info+0x41/0x70 [x_tables] [...] The buggy address belongs to the object at ffff8801eff2c3c0 which belongs to the cache kmalloc-2048 of size 2048 The buggy address is located 212 bytes inside of 2048-byte region [ffff8801eff2c3c0, ffff8801eff2cbc0) [...] ================================================================== Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_bpf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..1f7fbd3c7e5a 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, { struct sock_fprog_kern program; + if (len > XT_BPF_MAX_NUM_INSTR) + return -EINVAL; + program.len = len; program.filter = insns; @@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) mm_segment_t oldfs = get_fs(); int retval, fd; + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) + return -EINVAL; + set_fs(KERNEL_DS); fd = bpf_obj_get_user(path, 0); set_fs(oldfs); -- cgit v1.2.3 From 5ba7dcfe77037b67016263ea597a8b431692ecab Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Dec 2017 11:26:45 +0100 Subject: batman-adv: Fix lock for ogm cnt access in batadv_iv_ogm_calc_tq The originator node object orig_neigh_node is used to when accessing the bcast_own(_sum) and real_packet_count information. The access to them has to be protected with the spinlock in orig_neigh_node. But the function uses the lock in orig_node instead. This is incorrect because they could be two different originator node objects. Fixes: 0ede9f41b217 ("batman-adv: protect bit operations to count OGMs with spinlock") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 1b659ab652fb..bbe8414b6ee7 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1214,7 +1214,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1224,7 +1224,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) -- cgit v1.2.3 From e599ea1410c3a2f55716f9c309587235cca32025 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Dec 2017 15:28:44 -0800 Subject: Revert "tcp: must block bh in __inet_twsk_hashdance()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We had to disable BH _before_ calling __inet_twsk_hashdance() in commit cfac7f836a71 ("tcp/dccp: block bh before arming time_wait timer"). This means we can revert 614bdd4d6e61 ("tcp: must block bh in __inet_twsk_hashdance()"). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c690cd0d9b3f..b563e0c46bac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -93,7 +93,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. + * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock_bh(&bhead->lock); + spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -137,7 +137,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock_bh(lock); + spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -- cgit v1.2.3 From 029b6d1405504984b9d2661110ff1a17467d3426 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 2 Dec 2017 08:41:55 +0100 Subject: Revert "net: core: maybe return -EEXIST in __dev_alloc_name" This reverts commit d6f295e9def0; some userspace (in the case we noticed it's wpa_supplicant), is relying on the current error code to determine that a fixed name interface already exists. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 07ed21d64f92..f47e96b62308 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,7 +1106,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ - return p ? -ENFILE : -EEXIST; + return -ENFILE; } static int dev_alloc_name_ns(struct net *net, -- cgit v1.2.3 From 8afa10cbe281b10371fee5a87ab266e48d71a7f9 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 4 Dec 2017 13:31:11 +0200 Subject: net_sched: red: Avoid illegal values Check the qmin & qmax values doesn't overflow for the given Wlog value. Check that qmin <= qmax. Fixes: a783474591f2 ("[PKT_SCHED]: Generic RED layer") Signed-off-by: Nogah Frankel Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 3 +++ net/sched/sch_gred.c | 3 +++ net/sched/sch_red.c | 2 ++ net/sched/sch_sfq.c | 3 +++ 4 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index b30a2c70bd48..531250fceb9e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -369,6 +369,9 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ctl = nla_data(tb[TCA_CHOKE_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (ctl->limit > CHOKE_MAX_QUEUE) return -EINVAL; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 17c7130454bd..bc30f9186ac6 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -356,6 +356,9 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7f8ea9e297c3..9d874e60e032 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -212,6 +212,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; if (ctl->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 09c1203c1711..930e5bd26d3d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -639,6 +639,9 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog)) + return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) -- cgit v1.2.3 From 672ecbe1c977616aa720c9397589665b33e72610 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 4 Dec 2017 10:31:43 -0800 Subject: tipc: fix a null pointer deref on error path In tipc_topsrv_kern_subscr() when s->tipc_conn_new() fails we call tipc_close_conn() to clean up, but in this case calling conn_put() is just enough. This fixes the folllowing crash: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 3085 Comm: syzkaller064164 Not tainted 4.15.0-rc1+ #137 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: 00000000c24413a5 task.stack: 000000005e8160b5 RIP: 0010:__lock_acquire+0xd55/0x47f0 kernel/locking/lockdep.c:3378 RSP: 0018:ffff8801cb5474a8 EFLAGS: 00010002 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffff85ecb400 RBP: ffff8801cb547830 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff87489d60 R12: ffff8801cd2980c0 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000020 FS: 00000000014ee880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffee2426e40 CR3: 00000001cb85a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:4004 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:320 [inline] tipc_subscrb_subscrp_delete+0x8f/0x470 net/tipc/subscr.c:201 tipc_subscrb_delete net/tipc/subscr.c:238 [inline] tipc_subscrb_release_cb+0x17/0x30 net/tipc/subscr.c:316 tipc_close_conn+0x171/0x270 net/tipc/server.c:204 tipc_topsrv_kern_subscr+0x724/0x810 net/tipc/server.c:514 tipc_group_create+0x702/0x9c0 net/tipc/group.c:184 tipc_sk_join net/tipc/socket.c:2747 [inline] tipc_setsockopt+0x249/0xc10 net/tipc/socket.c:2861 SYSC_setsockopt net/socket.c:1851 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1830 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 14c04493cb77 ("tipc: add ability to order and receive topology events in driver") Reported-by: syzbot Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index acaef80fb88c..2710101ba4c1 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -511,7 +511,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, s = con->server; scbr = s->tipc_conn_new(*conid); if (!scbr) { - tipc_close_conn(con); + conn_put(con); return false; } -- cgit v1.2.3 From a7d5f107b4978e08eeab599ee7449af34d034053 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 4 Dec 2017 22:00:20 +0100 Subject: tipc: fix memory leak in tipc_accept_from_sock() When the function tipc_accept_from_sock() fails to create an instance of struct tipc_subscriber it omits to free the already created instance of struct tipc_conn instance before it returns. We fix that with this commit. Reported-by: David S. Miller Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 2710101ba4c1..d60c30342327 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -314,6 +314,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->usr_data = s->tipc_conn_new(newcon->conid); if (!newcon->usr_data) { sock_release(newsock); + conn_put(newcon); return -ENOMEM; } -- cgit v1.2.3 From c9d3fe9da094a9a7a3d3cd365b334b822e05f5e8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 5 Dec 2017 11:31:14 +0000 Subject: VSOCK: fix outdated sk_state value in hvs_release() Since commit 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 ("VSOCK: use TCP state constants for sk_state") VSOCK has used TCP_* constants for sk_state. Commit b4562ca7925a3bedada87a3dd072dd5bad043288 ("hv_sock: add locking in the open/close/release code paths") reintroduced the SS_DISCONNECTING constant. This patch replaces the old SS_DISCONNECTING with the new TCP_CLOSING constant. CC: Dexuan Cui CC: Cathy Avery Signed-off-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/hyperv_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 5583df708b8c..a827547aa102 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -487,7 +487,7 @@ static void hvs_release(struct vsock_sock *vsk) lock_sock(sk); - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; vsock_remove_sock(vsk); release_sock(sk); -- cgit v1.2.3 From 69c64866ce072dea1d1e59a0d61e0f66c0dffb76 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Tue, 5 Dec 2017 20:58:35 +0000 Subject: dccp: CVE-2017-8824: use-after-free in DCCP code Whenever the sock object is in DCCP_CLOSED state, dccp_disconnect() must free dccps_hc_tx_ccid and dccps_hc_rx_ccid and set to NULL. Signed-off-by: Mohamed Ghannam Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/proto.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b68168fcc06a..9d43c1f40274 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -259,6 +259,7 @@ int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); int err = 0; const int old_state = sk->sk_state; @@ -278,6 +279,10 @@ int dccp_disconnect(struct sock *sk, int flags) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = NULL; + dp->dccps_hc_tx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); -- cgit v1.2.3 From a5739435b5a3b8c449f8844ecd71a3b1e89f0a33 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:27:57 +0000 Subject: fix kcm_clone() 1) it's fput() or sock_release(), not both 2) don't do fd_install() until the last failure exit. 3) not a bug per se, but... don't attach socket to struct file until it's set up. Take reserving descriptor into the caller, move fd_install() to the caller, sanitize failure exits and calling conventions. Cc: stable@vger.kernel.org # v4.6+ Acked-by: Tom Herbert Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 71 +++++++++++++++++++++---------------------------------- 1 file changed, 27 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 0b750a22c4b9..c5fa634e63ca 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1625,60 +1625,35 @@ static struct proto kcm_proto = { }; /* Clone a kcm socket. */ -static int kcm_clone(struct socket *osock, struct kcm_clone *info, - struct socket **newsockp) +static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *newfile; - int err, newfd; + struct file *file; - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(0); - if (unlikely(newfd < 0)) { - err = newfd; - goto out_fd_fail; - } - - newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - goto out_sock_alloc_fail; - } - newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, true); if (!newsk) { - err = -ENOMEM; - goto out_sk_alloc_fail; + sock_release(newsock); + return ERR_PTR(-ENOMEM); } - sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - fd_install(newfd, newfile); - *newsockp = newsock; - info->fd = newfd; - - return 0; + file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); + if (IS_ERR(file)) + sock_release(newsock); -out_sk_alloc_fail: - fput(newfile); -out_sock_alloc_fail: - put_unused_fd(newfd); -out_fd_fail: - sock_release(newsock); -out: - return err; + return file; } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -1708,17 +1683,25 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct socket *newsock = NULL; - - err = kcm_clone(sock, &info, &newsock); - if (!err) { - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - err = -EFAULT; - sys_close(info.fd); - } - } + struct file *file; + + info.fd = get_unused_fd_flags(0); + if (unlikely(info.fd < 0)) + return info.fd; + file = kcm_clone(sock); + if (IS_ERR(file)) { + put_unused_fd(info.fd); + return PTR_ERR(file); + } + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + put_unused_fd(info.fd); + fput(file); + return -EFAULT; + } + fd_install(info.fd, file); + err = 0; break; } default: -- cgit v1.2.3 From 016a266bdfeda268afb2228b6217fd4771334635 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:28:38 +0000 Subject: socketpair(): allocate descriptors first simplifies failure exits considerably... Reviewed-by: Eric Dumazet Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 89 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 38 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..2df83c0bfde9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1365,88 +1365,75 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + /* + * reserve descriptors and make sure we won't fail + * to return them to userland. + */ + fd1 = get_unused_fd_flags(flags); + if (unlikely(fd1 < 0)) + return fd1; + + fd2 = get_unused_fd_flags(flags); + if (unlikely(fd2 < 0)) { + put_unused_fd(fd1); + return fd2; + } + + err = put_user(fd1, &usockvec[0]); + if (err) + goto out; + + err = put_user(fd2, &usockvec[1]); + if (err) + goto out; + /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); - if (err < 0) + if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); - if (err < 0) - goto out_release_1; - - err = sock1->ops->socketpair(sock1, sock2); - if (err < 0) - goto out_release_both; - - fd1 = get_unused_fd_flags(flags); - if (unlikely(fd1 < 0)) { - err = fd1; - goto out_release_both; + if (unlikely(err < 0)) { + sock_release(sock1); + goto out; } - fd2 = get_unused_fd_flags(flags); - if (unlikely(fd2 < 0)) { - err = fd2; - goto out_put_unused_1; + err = sock1->ops->socketpair(sock1, sock2); + if (unlikely(err < 0)) { + sock_release(sock2); + sock_release(sock1); + goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); - goto out_put_unused_both; + sock_release(sock1); + sock_release(sock2); + goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - goto out_fput_1; + sock_release(sock2); + fput(newfile1); + goto out; } - err = put_user(fd1, &usockvec[0]); - if (err) - goto out_fput_both; - - err = put_user(fd2, &usockvec[1]); - if (err) - goto out_fput_both; - audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); - /* fd1 and fd2 may be already another descriptors. - * Not kernel problem. - */ - return 0; -out_fput_both: - fput(newfile2); - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - goto out; - -out_fput_1: - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - sock_release(sock2); - goto out; - -out_put_unused_both: +out: put_unused_fd(fd2); -out_put_unused_1: put_unused_fd(fd1); -out_release_both: - sock_release(sock2); -out_release_1: - sock_release(sock1); -out: return err; } -- cgit v1.2.3 From 8e1611e2357927b22892ecc062d65c99d0d89066 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 23:29:09 +0000 Subject: make sock_alloc_file() do sock_release() on failures This changes calling conventions (and simplifies the hell out the callers). New rules: once struct socket had been passed to sock_alloc_file(), it's been consumed either by struct file or by sock_release() done by sock_alloc_file(). Either way the caller should not do sock_release() after that point. Reviewed-by: Eric Dumazet Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/9p/trans_fd.c | 1 - net/kcm/kcmsock.c | 7 +------ net/sctp/socket.c | 1 - net/socket.c | 25 ++++++++----------------- 4 files changed, 9 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 985046ae4231..80f5c79053a4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -839,7 +839,6 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", __func__, task_pid_nr(current)); - sock_release(csocket); kfree(p); return PTR_ERR(file); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index c5fa634e63ca..d4e98f20fc2a 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1629,7 +1629,6 @@ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *file; newsock = sock_alloc(); if (!newsock) @@ -1649,11 +1648,7 @@ static struct file *kcm_clone(struct socket *osock) sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - file = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (IS_ERR(file)) - sock_release(newsock); - - return file; + return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 014847e25648..eb17a911aa29 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5080,7 +5080,6 @@ static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *p *newfile = sock_alloc_file(newsock, 0, NULL); if (IS_ERR(*newfile)) { put_unused_fd(retval); - sock_release(newsock); retval = PTR_ERR(*newfile); *newfile = NULL; return retval; diff --git a/net/socket.c b/net/socket.c index 2df83c0bfde9..05f361faec45 100644 --- a/net/socket.c +++ b/net/socket.c @@ -406,8 +406,10 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) name.len = strlen(name.name); } path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); - if (unlikely(!path.dentry)) + if (unlikely(!path.dentry)) { + sock_release(sock); return ERR_PTR(-ENOMEM); + } path.mnt = mntget(sock_mnt); d_instantiate(path.dentry, SOCK_INODE(sock)); @@ -415,9 +417,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); if (IS_ERR(file)) { - /* drop dentry, keep inode */ + /* drop dentry, keep inode for a bit */ ihold(d_inode(path.dentry)); path_put(&path); + /* ... and now kill it properly */ + sock_release(sock); return file; } @@ -1330,19 +1334,9 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) retval = sock_create(family, type, protocol, &sock); if (retval < 0) - goto out; - - retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); - if (retval < 0) - goto out_release; - -out: - /* It may be already another descriptor 8) Not kernel problem. */ - return retval; + return retval; -out_release: - sock_release(sock); - return retval; + return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } /* @@ -1412,7 +1406,6 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); - sock_release(sock1); sock_release(sock2); goto out; } @@ -1420,7 +1413,6 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - sock_release(sock2); fput(newfile1); goto out; } @@ -1549,7 +1541,6 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); - sock_release(newsock); goto out_put; } -- cgit v1.2.3 From 71334963d01ed7ec61a958a5a6585172793f5a24 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:27:59 +0100 Subject: wireless: replace usage of hexdump with od/sed Since od/sed are in posix, hopefully there's a better chance people will have them, over hexdump. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 278d979c211a..63cbb6432b2d 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -27,7 +27,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done + @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ @@ -36,6 +36,6 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done + @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ -- cgit v1.2.3 From 715a12334764657bafb3ab964fb25f4e6115c770 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:59:33 +0100 Subject: wireless: don't write C files on failures Change the scripting inside the shipped/extra certs C code generation to not write the file when there are any failures. That way, if the build aborts due to failures, we don't get into a situation where a dummy file has been created and the next build succeeds, but not with the desired output. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 63cbb6432b2d..d7d6cb00c47b 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -25,17 +25,45 @@ endif $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) >> $@) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 extra_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ + ) >> $@) -- cgit v1.2.3 From 916a27901de01446bcf57ecca4783f6cff493309 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: netfilter: xt_osf: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 36e14b1f061d..a34f314a8c23 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -115,6 +119,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; -- cgit v1.2.3 From 32d3e51a82d453762ef148b2c4fbc8a7ec374a88 Mon Sep 17 00:00:00 2001 From: Chris Dion Date: Wed, 6 Dec 2017 10:50:28 -0500 Subject: net_sched: use macvlan real dev trans_start in dev_trans_start() Macvlan devices are similar to vlans and do not update their own trans_start. In order for arp monitoring to work for a bond device when the slaves are macvlans, obtain its real device. Signed-off-by: Chris Dion Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3839cbbdc32b..cd1b200acae7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -277,6 +278,8 @@ unsigned long dev_trans_start(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); + else if (netif_is_macvlan(dev)) + dev = macvlan_dev_real_dev(dev); res = netdev_get_tx_queue(dev, 0)->trans_start; for (i = 1; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; -- cgit v1.2.3 From f3069c6d33f6ae63a1668737bc78aaaa51bff7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Wed, 6 Dec 2017 17:18:28 +0100 Subject: rds: Fix NULL pointer dereference in __rds_rdma_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for syzkaller719569, where memory registration was attempted without any underlying transport being loaded. Analysis of the case reveals that it is the setsockopt() RDS_GET_MR (2) and RDS_GET_MR_FOR_DEST (7) that are vulnerable. Here is an example stack trace when the bug is hit: BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0 IP: __rds_rdma_map+0x36/0x440 [rds] PGD 2f93d03067 P4D 2f93d03067 PUD 2f93d02067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: bridge stp llc tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache rds binfmt_misc sb_edac intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul c rc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd iTCO_wdt mei_me sg iTCO_vendor_support ipmi_si mei ipmi_devintf nfsd shpchp pcspkr i2c_i801 ioatd ma ipmi_msghandler wmi lpc_ich mfd_core auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 mgag200 i2c_algo_bit drm_kms_helper ixgbe syscopyarea ahci sysfillrect sysimgblt libahci mdio fb_sys_fops ttm ptp libata sd_mod mlx4_core drm crc32c_intel pps_core megaraid_sas i2c_core dca dm_mirror dm_region_hash dm_log dm_mod CPU: 48 PID: 45787 Comm: repro_set2 Not tainted 4.14.2-3.el7uek.x86_64 #2 Hardware name: Oracle Corporation ORACLE SERVER X5-2L/ASM,MOBO TRAY,2U, BIOS 31110000 03/03/2017 task: ffff882f9190db00 task.stack: ffffc9002b994000 RIP: 0010:__rds_rdma_map+0x36/0x440 [rds] RSP: 0018:ffffc9002b997df0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff882fa2182580 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc9002b997e40 RDI: ffff882fa2182580 RBP: ffffc9002b997e30 R08: 0000000000000000 R09: 0000000000000002 R10: ffff885fb29e3838 R11: 0000000000000000 R12: ffff882fa2182580 R13: ffff882fa2182580 R14: 0000000000000002 R15: 0000000020000ffc FS: 00007fbffa20b700(0000) GS:ffff882fbfb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000c0 CR3: 0000002f98a66006 CR4: 00000000001606e0 Call Trace: rds_get_mr+0x56/0x80 [rds] rds_setsockopt+0x172/0x340 [rds] ? __fget_light+0x25/0x60 ? __fdget+0x13/0x20 SyS_setsockopt+0x80/0xe0 do_syscall_64+0x67/0x1b0 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fbff9b117f9 RSP: 002b:00007fbffa20aed8 EFLAGS: 00000293 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00000000000c84a4 RCX: 00007fbff9b117f9 RDX: 0000000000000002 RSI: 0000400000000114 RDI: 000000000000109b RBP: 00007fbffa20af10 R08: 0000000000000020 R09: 00007fbff9dd7860 R10: 0000000020000ffc R11: 0000000000000293 R12: 0000000000000000 R13: 00007fbffa20b9c0 R14: 00007fbffa20b700 R15: 0000000000000021 Code: 41 56 41 55 49 89 fd 41 54 53 48 83 ec 18 8b 87 f0 02 00 00 48 89 55 d0 48 89 4d c8 85 c0 0f 84 2d 03 00 00 48 8b 87 00 03 00 00 <48> 83 b8 c0 00 00 00 00 0f 84 25 03 00 0 0 48 8b 06 48 8b 56 08 The fix is to check the existence of an underlying transport in __rds_rdma_map(). Signed-off-by: Håkon Bugge Reported-by: syzbot Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8886f15abe90..bc2f1e0977d6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0) { + if (rs->rs_bound_addr == 0 || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } -- cgit v1.2.3 From 74c4b656c3d92ec4c824ea1a4afd726b7b6568c8 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Wed, 6 Dec 2017 17:15:43 -0800 Subject: adding missing rcu_read_unlock in ipxip6_rcv commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") introduced new exit point in ipxip6_rcv. however rcu_read_unlock is missing there. this diff is fixing this v1->v2: instead of doing rcu_read_unlock in place, we are going to "drop" section (to prevent skb leakage) Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Signed-off-by: Nikita V. Shirokov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3d3092adf1d2..db84f523656d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -904,7 +904,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (t->parms.collect_md) { tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); if (!tun_dst) - return 0; + goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); -- cgit v1.2.3 From 8632385022f2b05a6ca0b9e0f95575865de0e2ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Dec 2017 11:08:19 -0800 Subject: tcp: use current time in tcp_rcv_space_adjust() When I switched rcv_rtt_est to high resolution timestamps, I forgot that tp->tcp_mstamp needed to be refreshed in tcp_rcv_space_adjust() Using an old timestamp leads to autotuning lags. Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 734cfc8ff76e..514c00732988 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,6 +579,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; -- cgit v1.2.3 From 96307a0a75d8f1847debefd6a402339aac43e224 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Dec 2017 14:26:09 +0100 Subject: netfilter: ipt_CLUSTERIP: fix clusterip_net_exit build regression The added check produces a build error when CONFIG_PROC_FS is disabled: net/ipv4/netfilter/ipt_CLUSTERIP.c: In function 'clusterip_net_exit': net/ipv4/netfilter/ipt_CLUSTERIP.c:822:28: error: 'cn' undeclared (first use in this function) This moves the variable declaration out of the #ifdef to make it available to the WARN_ON_ONCE(). Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added") Signed-off-by: Arnd Bergmann Reviewed-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e35b8d074f06..69060e3abe85 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -813,8 +813,8 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { -#ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); +#ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; #endif -- cgit v1.2.3 From 75bf50f4aaa1c78d769d854ab3d975884909e4fb Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Thu, 7 Dec 2017 21:54:27 +0100 Subject: xfrm: fix xfrm_do_migrate() with AEAD e.g(AES-GCM) copy geniv when cloning the xfrm state. x->geniv was not copied to the new state and migration would fail. xfrm_do_migrate .. xfrm_state_clone() .. .. esp_init_aead() crypto_alloc_aead() crypto_alloc_tfm() crypto_find_alg() return EAGAIN and failed Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1f5cee2269af..88d0a563e141 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1344,6 +1344,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); + x->geniv = orig->geniv; if (!x->aead) goto error; } -- cgit v1.2.3 From 732706afe1cc46ef48493b3d2b69c98f36314ae4 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 8 Dec 2017 08:07:25 +0100 Subject: xfrm: Fix stack-out-of-bounds with misconfigured transport mode policies. On policies with a transport mode template, we pass the addresses from the flowi to xfrm_state_find(), assuming that the IP addresses (and address family) don't change during transformation. Unfortunately our policy template validation is not strict enough. It is possible to configure policies with transport mode template where the address family of the template does not match the selectors address family. This lead to stack-out-of-bound reads because we compare arddesses of the wrong family. Fix this by refusing such a configuration, address family can not change on transport mode. We use the assumption that, on transport mode, the first templates address family must match the address family of the policy selector. Subsequent transport mode templates must mach the address family of the previous template. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ff58c37469d6..bdb48e5dba04 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1419,11 +1419,14 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) { + u16 prev_family; int i; if (nr > XFRM_MAX_DEPTH) return -EINVAL; + prev_family = family; + for (i = 0; i < nr; i++) { /* We never validated the ut->family value, so many * applications simply leave it at zero. The check was @@ -1435,6 +1438,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) if (!ut[i].family) ut[i].family = family; + if ((ut[i].mode == XFRM_MODE_TRANSPORT) && + (ut[i].family != prev_family)) + return -EINVAL; + + prev_family = ut[i].family; + switch (ut[i].family) { case AF_INET: break; -- cgit v1.2.3 From d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 7 Dec 2017 13:41:34 -0800 Subject: tcp: invalidate rate samples during SACK reneging Mark tcp_sock during a SACK reneging event and invalidate rate samples while marked. Such rate samples may overestimate bw by including packets that were SACKed before reneging. < ack 6001 win 10000 sack 7001:38001 < ack 7001 win 0 sack 8001:38001 // Reneg detected > seq 7001:8001 // RTO, SACK cleared. < ack 38001 win 10000 In above example the rate sample taken after the last ack will count 7001-38001 as delivered while the actual delivery rate likely could be much lower i.e. 7001-8001. This patch adds a new field tcp_sock.sack_reneg and marks it when we declare SACK reneging and entering TCP_CA_Loss, and unmarks it after the last rate sample was taken before moving back to TCP_CA_Open. This patch also invalidates rate samples taken while tcp_sock.is_sack_reneg is set. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Eric Dumazet Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 10 ++++++++-- net/ipv4/tcp_rate.c | 10 +++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf97317e6c97..f08eebe60446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2412,6 +2412,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 514c00732988..075c559570e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1942,6 +1942,8 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2365,6 +2367,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2398,8 +2401,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -3496,6 +3501,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3612,7 +3618,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; -- cgit v1.2.3 From c589e69b508d29ed8e644dfecda453f71c02ec27 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:30 -0500 Subject: tcp_bbr: record "full bw reached" decision in new full_bw_reached bit This commit records the "full bw reached" decision in a new full_bw_reached bit. This is a pure refactor that does not change the current behavior, but enables subsequent fixes and improvements. In particular, this enables simple and clean fixes because the full_bw and full_bw_cnt can be unconditionally zeroed without worrying about forgetting that we estimated we filled the pipe in Startup. And it enables future improvements because multiple code paths can be used for estimating that we filled the pipe in Startup; any new code paths only need to set this bit when they think the pipe is full. Note that this fix intentionally reduces the width of the full_bw_cnt counter, since we have never used the most significant bit. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 69ee877574d0..3089c956b9f9 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -110,7 +110,8 @@ struct bbr { u32 lt_last_lost; /* LT intvl start: tp->lost */ u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_cnt:3, /* number of rounds without large bw gains */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ unused_b:5; @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk) { const struct bbr *bbr = inet_csk_ca(sk); - return bbr->full_bw_cnt >= bbr_full_bw_cnt; + return bbr->full_bw_reached; } /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk, return; } ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; } /* If pipe is probably full, drain the queue and then enter steady-state. */ @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk) bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; + bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; -- cgit v1.2.3 From 2f6c498e4f15d27852c04ed46d804a39137ba364 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:31 -0500 Subject: tcp_bbr: reset full pipe detection on loss recovery undo Fix BBR so that upon notification of a loss recovery undo BBR resets the full pipe detection (STARTUP exit) state machine. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this could previously cause BBR to spuriously estimate that the pipe is full. Since spurious loss recovery means that our overall sending will have slowed down spuriously, this commit gives a flow more time to probe robustly for bandwidth and decide the pipe is really full. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 3089c956b9f9..ab3ff14ea7f7 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -874,6 +874,10 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From 600647d467c6d04b3954b41a6ee1795b5ae00550 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:32 -0500 Subject: tcp_bbr: reset long-term bandwidth sampling on loss recovery undo Fix BBR so that upon notification of a loss recovery undo BBR resets long-term bandwidth sampling. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this can cause BBR to spuriously estimate that we are seeing loss rates high enough to trigger long-term bandwidth estimation. To avoid that problem, this commit resets long-term bandwidth sampling on loss recovery undo events. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index ab3ff14ea7f7..8322f26e770e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -878,6 +878,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From 0ce294d88457bccd7f9991f883fec80022a1ddbd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:30 -0800 Subject: tcp: correctly test congestion state in RACK RACK does not test the loss recovery state correctly to compute the reordering window. It assumes if lost_out is zero then TCP is not in loss recovery. But it can be zero during recovery before calling tcp_rack_detect_loss(): when an ACK acknowledges all packets marked lost before receiving this ACK, but has not yet to discover new ones by tcp_rack_detect_loss(). The fix is to simply test the congestion state directly. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d3ea89020c69..3143664902e9 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -55,7 +55,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && + min_rtt != ~0U) { reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); reo_wnd = min(reo_wnd, tp->srtt_us >> 3); } -- cgit v1.2.3 From cd1fc85b4399d47e3d6626301741ba8c38cd475a Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:31 -0800 Subject: tcp: always evaluate losses in RACK upon undo When sender detects spurious retransmission, all packets marked lost are remarked to be in-flight. However some may be considered lost based on its timestamps in RACK. This patch forces RACK to re-evaluate, which may be skipped previously if the ACK does not advance RACK timestamp. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 075c559570e6..9550cc42de2d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2329,6 +2329,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; + tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) -- cgit v1.2.3 From 428aec5e69fa17d223e1495f395833c50770f7ae Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:32 -0800 Subject: tcp: fix off-by-one bug in RACK RACK should mark a packet lost when remaining wait time is zero. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 3143664902e9..0c182303e62e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -80,12 +80,12 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) */ remaining = tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); - if (remaining < 0) { + if (remaining <= 0) { tcp_rack_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { - /* Record maximum wait time (+1 to avoid 0) */ - *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + /* Record maximum wait time */ + *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } -- cgit v1.2.3 From 6065fd0d179b96ddc488c76542349bcb148a95fd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 7 Dec 2017 11:33:33 -0800 Subject: tcp: evaluate packet losses upon RTT change RACK skips an ACK unless it advances the most recently delivered TX timestamp (rack.mstamp). Since RACK also uses the most recent RTT to decide if a packet is lost, RACK should still run the loss detection whenever the most recent RTT changes. For example, an ACK that does not advance the timestamp but triggers the cwnd undo due to reordering, would then use the most recent (higher) RTT measurement to detect further losses. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Priyaranjan Jha Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 0c182303e62e..3a81720ac0c4 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -117,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, { u32 rtt_us; - if (tp->rack.mstamp && - !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) - return; - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (sacked & TCPCB_RETRANS) { + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -134,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - if (rtt_us < tcp_min_rtt(tp)) - return; + return; } - tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } } /* We have waited long enough to accommodate reordering. Mark the expired -- cgit v1.2.3 From 0afe9d4ab9d40c281bdcdd118661fe8e4bdcef18 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 9 Dec 2017 21:10:10 +0100 Subject: mac80211: fix locking in ieee80211_sta_tear_down_BA_sessions Due to overlap between commit 1281103770e9 ("mac80211: Simplify locking in ieee80211_sta_tear_down_BA_sessions()") and the way that Luca modified commit 72e2c3438ba3 ("mac80211: tear down RX aggregations first") when sending it upstream from Intel's internal tree, we get the following warning: WARNING: CPU: 0 PID: 5472 at net/mac80211/agg-tx.c:315 ___ieee80211_stop_tx_ba_session+0x158/0x1f0 since there's no appropriate locking around the call to ___ieee80211_stop_tx_ba_session; Sara's original just had a call to the locked __ieee80211_stop_tx_ba_session (one less underscore) but it looks like Luca modified both of the calls when fixing it up for upstream, leading to the problem at hand. Move the locking appropriately to fix this problem. Reported-by: Kalle Valo Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 167f83b853e6..1621b6ab17ba 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -291,16 +291,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, int i; mutex_lock(&sta->ampdu_mlme.mtx); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); - } - mutex_unlock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_tx_ba_session(sta, i, reason); + mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); -- cgit v1.2.3 From 4564b187c16327045d87596e8980c65ba7b84c50 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Dec 2017 12:33:47 +0100 Subject: nl80211: fix nl80211_send_iface() error paths Evidently I introduced a locking bug in my change here, the nla_put_failure sometimes needs to unlock. Fix it. Fixes: 44905265bc15 ("nl80211: don't expose wdev->ssid for most interfaces") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1ac23ca20c8..213d0c498c97 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2610,7 +2610,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag case NL80211_IFTYPE_AP: if (wdev->ssid_len && nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) - goto nla_put_failure; + goto nla_put_failure_locked; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: @@ -2623,7 +2623,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (!ssid_ie) break; if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure; + goto nla_put_failure_locked; break; } default: @@ -2635,6 +2635,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; + nla_put_failure_locked: + wdev_unlock(wdev); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; -- cgit v1.2.3 From f5b5702ac55b11113a94d6228d191c7f827b7a3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Dec 2017 10:14:27 +0100 Subject: netfilter: exthdr: add missign attributes to policy Add missing netlink attribute policy. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a0a93d987a3b..47ec1046ad11 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -214,6 +214,8 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 23715275e4fb6f64358a499d20928a9e93819f2f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 11 Dec 2017 18:19:33 +0300 Subject: netfilter: ip6t_MASQUERADE: add dependency on conntrack module After commit 4d3a57f23dec ("netfilter: conntrack: do not enable connection tracking unless needed") conntrack is disabled by default unless some module explicitly declares dependency in particular network namespace. Fixes: a357b3f80bc8 ("netfilter: nat: add dependencies on conntrack module") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_MASQUERADE.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 2b1a15846f9a..92c0047e7e33 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -33,13 +33,19 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg6_destroy, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", -- cgit v1.2.3 From 93c647643b48f0131f02e45da3bd367d80443291 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Wed, 6 Dec 2017 12:12:27 -0800 Subject: netlink: Add netns check on taps Currently, a nlmon link inside a child namespace can observe systemwide netlink activity. Filter the traffic so that nlmon can only sniff netlink messages from its own netns. Test case: vpnns -- bash -c "ip link add nlmon0 type nlmon; \ ip link set nlmon0 up; \ tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" & sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \ spi 0x1 mode transport \ auth sha1 0x6162633132330000000000000000000000000000 \ enc aes 0x00000000000000000000000000000000 grep --binary abc123 /tmp/nlmon.pcap Signed-off-by: Kevin Cernekee Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b9e0ee4e22f5..79cc1bf36e4a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,6 +253,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct sock *sk = skb->sk; int ret = -ENOMEM; + if (!net_eq(dev_net(dev), sock_net(sk))) + return 0; + dev_hold(dev); if (is_vmalloc_addr(skb->head)) -- cgit v1.2.3 From 8f659a03a0ba9289b9aeb9b4470e6fb263d6f483 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Sun, 10 Dec 2017 03:50:58 +0000 Subject: net: ipv4: fix for a race condition in raw_sendmsg inet->hdrincl is racy, and could lead to uninitialized stack pointer usage, so its value should be read only once. Fixes: c008ba5bdc9f ("ipv4: Avoid reading user iov twice after raw_probe_proto_opt") Signed-off-by: Mohamed Ghannam Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..125c1eab3eaa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields + */ + hdrincl = inet->hdrincl; /* * Check the flags. */ @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); -- cgit v1.2.3 From 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 15:40:51 +0800 Subject: sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams Now in sctp_setsockopt_reset_streams, it only does the check optlen < sizeof(*params) for optlen. But it's not enough, as params->srs_number_streams should also match optlen. If the streams in params->srs_stream_list are less than stream nums in params->srs_number_streams, later when dereferencing the stream list, it could cause a slab-out-of-bounds crash, as reported by syzbot. This patch is to fix it by also checking the stream numbers in sctp_setsockopt_reset_streams to make sure at least it's not greater than the streams in the list. Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter") Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index eb17a911aa29..3253f724a995 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3891,13 +3891,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(struct sctp_reset_streams)) + if (optlen < sizeof(*params)) return -EINVAL; params = memdup_user(optval, optlen); if (IS_ERR(params)) return PTR_ERR(params); + if (params->srs_number_streams * sizeof(__u16) > + optlen - sizeof(*params)) + goto out; + asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) goto out; -- cgit v1.2.3 From d2950278d2d04ff5314abeb38d9c59c4e7c0ee53 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Dec 2017 18:23:09 +0100 Subject: xfrm: put policies when reusing pcpu xdst entry We need to put the policies when re-using the pcpu xdst entry, else this leaks the reference. Fixes: ec30d78c14a813db39a647b6a348b428 ("xfrm: add xdst pcpu cache") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 038ec68f6901..70aa5cb0c659 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1839,6 +1839,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, sizeof(struct xfrm_policy *) * num_pols) == 0 && xfrm_xdst_can_reuse(xdst, xfrm, err)) { dst_hold(&xdst->u.dst); + xfrm_pols_put(pols, num_pols); while (err > 0) xfrm_state_put(xfrm[--err]); return xdst; -- cgit v1.2.3 From 30791ac41927ebd3e75486f9504b6d2280463bf0 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 11 Dec 2017 00:05:46 -0800 Subject: tcp md5sig: Use skb's saddr when replying to an incoming segment The MD5-key that belongs to a connection is identified by the peer's IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying to an incoming segment from tcp_check_req() that failed the seq-number checks. Thus, to find the correct key, we need to use the skb's saddr and not the daddr. This bug seems to have been there since quite a while, but probably got unnoticed because the consequences are not catastrophic. We will call tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer, thus the connection doesn't really fail. Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().") Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77ea45da0fe9..94e28350f420 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1f04ec0e4a7a..7178476b3d2f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -994,7 +994,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 0, 0); } -- cgit v1.2.3 From b9b312a7a451e9c098921856e7cfbc201120e1a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:03:38 -0800 Subject: ipv6: mcast: better catch silly mtu values syzkaller reported crashes in IPv6 stack [1] Xin Long found that lo MTU was set to silly values. IPv6 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in mld code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv6 minimal MTU. [1] skbuff: skb_over_panic: text:0000000010b86b8d len:196 put:20 head:000000003b477e60 data:000000000e85441e tail:0xd4 end:0xc0 dev:lo ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc2-mm1+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x15c/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801db307508 EFLAGS: 00010286 RAX: 0000000000000082 RBX: ffff8801c517e840 RCX: 0000000000000000 RDX: 0000000000000082 RSI: 1ffff1003b660e61 RDI: ffffed003b660e95 RBP: ffff8801db307570 R08: 1ffff1003b660e23 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff85bd4020 R13: ffffffff84754ed2 R14: 0000000000000014 R15: ffff8801c4e26540 FS: 0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000463610 CR3: 00000001c6698000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_over_panic net/core/skbuff.c:109 [inline] skb_put+0x181/0x1c0 net/core/skbuff.c:1694 add_grhead.isra.24+0x42/0x3b0 net/ipv6/mcast.c:1695 add_grec+0xa55/0x1060 net/ipv6/mcast.c:1817 mld_send_cr net/ipv6/mcast.c:1903 [inline] mld_ifc_timer_expire+0x4d2/0x770 net/ipv6/mcast.c:2448 call_timer_fn+0x23b/0x840 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7e1/0xb60 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x29d/0xbb2 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1d3/0x210 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:920 Signed-off-by: Eric Dumazet Reported-by: syzbot Tested-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fc6d7d143f2c..844642682b83 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, struct mld2_grec **ppgr) + int type, struct mld2_grec **ppgr, unsigned int mtu) { - struct net_device *dev = pmc->idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr; - if (!skb) - skb = mld_newpack(pmc->idev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = mld_newpack(pmc->idev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV6_MIN_MTU) + return skb; + isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); } } first = 1; @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -1814,7 +1819,7 @@ empty_source: mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) -- cgit v1.2.3 From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:17:39 -0800 Subject: ipv4: igmp: guard against silly MTU values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPv4 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in igmp code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv4 minimal MTU. This patch adds missing IPV4_MIN_MTU define, to not abuse ETH_MIN_MTU anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv4/igmp.c | 24 +++++++++++++++--------- net/ipv4/ip_tunnel.c | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4573bccd6da..7a93359fbc72 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1428,7 +1428,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d1f8f302dbf3..50448a220a1f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +544,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe6fee728ce4..5ddb1cb52bd4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } -- cgit v1.2.3 From 83593010d3b87601e775f240ce46c53ddf25828d Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Mon, 11 Dec 2017 22:09:46 +0530 Subject: net: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 1 - net/dsa/slave.c | 1 - net/netfilter/nf_conntrack_netlink.c | 1 - net/sched/act_meta_mark.c | 1 - net/sched/act_meta_skbtcindex.c | 1 - net/sched/cls_api.c | 1 - net/sched/cls_u32.c | 1 - 7 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 1c4810919a0a..b9057478d69c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d6e7a642493b..a95a55f79137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..332b51870ed7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #ifdef CONFIG_NF_NAT_NEEDED #include diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 1e3f10e5da99..6445184b2759 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbmark_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 2ea1f26c9e96..7221437ca3a6 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ddcf04b4ab43..f40256a3e7f0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ac152b4f4247..507859cdd1cb 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -45,7 +45,6 @@ #include #include #include -#include #include struct tc_u_knode { -- cgit v1.2.3 From c545a945d0d9ea2ea2c7d23d43cf0d86e32cd7cf Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 11 Dec 2017 19:11:55 +0100 Subject: tipc: eliminate potential memory leak In the function tipc_sk_mcast_rcv() we call refcount_dec(&skb->users) on received sk_buffers. Since the reference counter might hit zero at this point, we have a potential memory leak. We fix this by replacing refcount_dec() with kfree_skb(). Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d18c0caa92b..41127d0b925e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1140,7 +1140,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, __skb_dequeue(arrvq); __skb_queue_tail(inputq, skb); } - refcount_dec(&skb->users); + kfree_skb(skb); spin_unlock_bh(&inputq->lock); continue; } -- cgit v1.2.3 From a46182b00290839fa3fa159d54fd3237bd8669f0 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Mon, 11 Dec 2017 11:13:45 -0800 Subject: net: igmp: Use correct source address on IGMPv3 reports Closing a multicast socket after the final IPv4 address is deleted from an interface can generate a membership report that uses the source IP from a different interface. The following test script, run from an isolated netns, reproduces the issue: #!/bin/bash ip link add dummy0 type dummy ip link add dummy1 type dummy ip link set dummy0 up ip link set dummy1 up ip addr add 10.1.1.1/24 dev dummy0 ip addr add 192.168.99.99/24 dev dummy1 tcpdump -U -i dummy0 & socat EXEC:"sleep 2" \ UDP4-DATAGRAM:239.101.1.68:8889,ip-add-membership=239.0.1.68:10.1.1.1 & sleep 1 ip addr del 10.1.1.1/24 dev dummy0 sleep 5 kill %tcpdump RFC 3376 specifies that the report must be sent with a valid IP source address from the destination subnet, or from address 0.0.0.0. Add an extra check to make sure this is the case. Signed-off-by: Kevin Cernekee Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 50448a220a1f..726f6b608274 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (inet_ifa_match(fl4->saddr, ifa)) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); -- cgit v1.2.3 From 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 12 Dec 2017 16:28:58 -0800 Subject: tcp: fix potential underestimation on rcv_rtt When ms timestamp is used, current logic uses 1us in tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us. This could cause rcv_rtt underestimation. Fix it by always using a min value of 1ms if ms timestamp is used. Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9550cc42de2d..45f750e85714 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -508,9 +508,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; - if (m == 0) - m = 1; - if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially @@ -547,6 +544,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + if (!delta_us) + delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -563,8 +562,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + u32 delta_us; + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); tcp_rcv_rtt_update(tp, delta_us, 0); } } -- cgit v1.2.3 From 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 18:22:52 -0800 Subject: tcp: refresh tcp_mstamp from timers callbacks Only the retransmit timer currently refreshes tcp_mstamp We should do the same for delayed acks and keepalives. Even if RFC 7323 does not request it, this is consistent to what linux did in the past, when TS values were based on jiffies. Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Mike Maloney Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Mike Maloney Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 16df6dd44b98..968fda198376 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } + tcp_mstamp_refresh(tcp_sk(sk)); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } @@ -632,6 +633,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } + tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; -- cgit v1.2.3 From 2d17d8d79e77ff3f1b35b87522fc72fa562260ff Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 14 Dec 2017 17:17:56 -0800 Subject: xdp: linearize skb in netif_receive_generic_xdp() In netif_receive_generic_xdp(), it is necessary to linearize all nonlinear skb. However, in current implementation, skb with troom <= 0 are not linearized. This patch fixes this by calling skb_linearize() for all nonlinear skb. Fixes: de8f3a83b0a0 ("bpf: add meta pointer for direct access") Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f47e96b62308..01ee854454a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3904,7 +3904,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; - if (troom > 0 && __skb_linearize(skb)) + if (skb_linearize(skb)) goto do_drop; } -- cgit v1.2.3 From 35b99dffc3f710cafceee6c8c6ac6a98eb2cb4bf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 13 Dec 2017 14:41:06 -0500 Subject: sock: free skb in skb_complete_tx_timestamp on error skb_complete_tx_timestamp must ingest the skb it is passed. Call kfree_skb if the skb cannot be enqueued. Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl") Fixes: 9ac25fc06375 ("net: fix socket refcounting in skb_complete_tx_timestamp()") Reported-by: Richard Cochran Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6b0ff396fa9d..a592ca025fc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4293,7 +4293,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) - return; + goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. @@ -4302,7 +4302,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); + return; } + +err: + kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); -- cgit v1.2.3 From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:29 +0200 Subject: net: sched: Add TCA_HW_OFFLOAD Qdiscs can be offloaded to HW, but current implementation isn't uniform. Instead, qdiscs either pass information about offload status via their TCA_OPTIONS or omit it altogether. Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform uAPI for the offloading status of qdiscs. Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b6c4f536876b..0f1eab99ff4e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; + if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; qlen = q->q.qlen; -- cgit v1.2.3 From 428a68af3a7c3a3380ff1f750a24d213f370f89f Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:30 +0200 Subject: net: sched: Move to new offload indication in RED Let RED utilize the new internal flag, TCQ_F_OFFLOADED, to mark a given qdisc as offloaded instead of using a dedicated indication. Also, change internal logic into looking at said flag when possible. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_red.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 9d874e60e032..f0747eb87dc4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable) .handle = sch->handle, .parent = sch->parent, }; + int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.command = TC_RED_DESTROY; } - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + + if (!err && enable) + sch->flags |= TCQ_F_OFFLOADED; + else + sch->flags &= ~TCQ_F_OFFLOADED; + + return err; } static void red_destroy(struct Qdisc *sch) @@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } -static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) { struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { @@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = &sch->qstats, }, }; - int err; - opt->flags &= ~TC_RED_OFFLOADED; - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, - &hw_stats); - if (err == -EOPNOTSUPP) + if (!(sch->flags & TCQ_F_OFFLOADED)) return 0; - if (!err) - opt->flags |= TC_RED_OFFLOADED; - - return err; + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) int err; sch->qstats.backlog = q->qdisc->qstats.backlog; - err = red_dump_offload(sch, &opt); + err = red_dump_offload_stats(sch, &opt); if (err) goto nla_put_failure; @@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + if (sch->flags & TCQ_F_OFFLOADED) { struct red_stats hw_stats = {0}; struct tc_red_qopt_offload hw_stats_request = { .command = TC_RED_XSTATS, -- cgit v1.2.3 From c05fad5713b81b049ec6ac4eb2d304030b1efdce Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 15 Dec 2017 10:46:16 +0800 Subject: ip_gre: fix wrong return value of erspan_rcv If pskb_may_pull return failed, return PACKET_REJECT instead of -ENOMEM. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Cc: William Tu Signed-off-by: Haishuang Yan Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bb6239169b1a..9c1735632c8c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -266,7 +266,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, len = gre_hdr_len + sizeof(*ershdr); if (unlikely(!pskb_may_pull(skb, len))) - return -ENOMEM; + return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); -- cgit v1.2.3 From ccede7598588ae344143f82fb763912535648d58 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:04:04 -0500 Subject: xprtrdma: Spread reply processing over more CPUs Commit d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion") introduced a performance regression for NFS I/O small enough to not need memory registration. In multi- threaded benchmarks that generate primarily small I/O requests, IOPS throughput is reduced by nearly a third. This patch restores the previous level of throughput. Because workqueues are typically BOUND (in particular ib_comp_wq, nfsiod_workqueue, and rpciod_workqueue), NFS/RDMA workloads tend to aggregate on the CPU that is handling Receive completions. The usual approach to addressing this problem is to create a QP and CQ for each CPU, and then schedule transactions on the QP for the CPU where you want the transaction to complete. The transaction then does not require an extra context switch during completion to end up on the same CPU where the transaction was started. This approach doesn't work for the Linux NFS/RDMA client because currently the Linux NFS client does not support multiple connections per client-server pair, and the RDMA core API does not make it straightforward for ULPs to determine which CPU is responsible for handling Receive completions for a CQ. So for the moment, record the CPU number in the rpcrdma_req before the transport sends each RPC Call. Then during Receive completion, queue the RPC completion on that same CPU. Additionally, move all RPC completion processing to the deferred handler so that even RPCs with simple small replies complete on the CPU that sent the corresponding RPC Call. Fixes: d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler ...") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +----- net/sunrpc/xprtrdma/transport.c | 2 ++ net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ed34dc0f144c..a3f2ab283aeb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1408,11 +1408,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); - if (list_empty(&req->rl_registered) && - !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) - rpcrdma_complete_rqst(rep); - else - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 646c24494ea7..6ee1ad8978f3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "xprt_rdma.h" @@ -656,6 +657,7 @@ xprt_rdma_allocate(struct rpc_task *task) task->tk_pid, __func__, rqst->rq_callsize, rqst->rq_rcvsize, req); + req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 710b3f77db82..8607c029c0dd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void) struct workqueue_struct *recv_wq; recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!recv_wq) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 51686d9eac5f..1342f743f1c4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -342,6 +342,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; + int rl_cpu; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; -- cgit v1.2.3 From 90d91b0cd371193d9dbfa9beacab8ab9a4cb75e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Dec 2017 21:24:08 -0500 Subject: SUNRPC: Fix a race in the receive code path We must ensure that the call to rpc_sleep_on() in xprt_transmit() cannot race with the call to xprt_complete_rqst(). Reported-by: Chuck Lever Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=317 Fixes: ce7c252a8c74 ("SUNRPC: Add a separate spinlock to protect..") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 02a9bacb239b..5b06f6906a27 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1001,6 +1001,7 @@ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + unsigned int connect_cookie; int status, numreqs; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1024,6 +1025,7 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; + connect_cookie = xprt->connect_cookie; req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); @@ -1047,20 +1049,28 @@ void xprt_transmit(struct rpc_task *task) xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; + spin_unlock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else { + req->rq_connect_cookie = connect_cookie; + if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { /* - * Sleep on the pending queue since - * we're expecting a reply. + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). */ - if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) + spin_lock(&xprt->recv_lock); + if (!req->rq_reply_bytes_recvd) { rpc_sleep_on(&xprt->pending, task, xprt_timer); - req->rq_connect_cookie = xprt->connect_cookie; + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (!xprt_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock(&xprt->recv_lock); } - spin_unlock_bh(&xprt->transport_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) -- cgit v1.2.3 From 343723dd51ef1025a860e54df9472b5ba21ee3d9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:12 +0100 Subject: net: sched: fix clsact init error path Since in qdisc_create, the destroy op is called when init fails, we don't do cleanup in init and leave it up to destroy. This fixes use-after-free when trying to put already freed block. Fixes: 6e40cf2d4dee ("net: sched: use extended variants of block_get/put in ingress and clsact qdiscs") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- net/sched/sch_ingress.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f40256a3e7f0..b91ea03e3afa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,6 +351,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { struct tcf_chain *chain; + if (!block) + return; /* Hold a refcnt for all chains, except 0, so that they don't disappear * while we are iterating. */ @@ -377,8 +379,6 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; - if (!block) - return; tcf_block_put_ext(block, block->q, &ei); } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5ecc38f35d47..5e1cd2e5df87 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -190,7 +190,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) - goto err_egress_block_get; + return err; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -198,10 +198,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; - -err_egress_block_get: - tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); - return err; } static void clsact_destroy(struct Qdisc *sch) -- cgit v1.2.3 From b59e6979a86384e68b0ab6ffeab11f0034fba82d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:13 +0100 Subject: net: sched: fix static key imbalance in case of ingress/clsact_init error Move static key increments to the beginning of the init function so they pair 1:1 with decrements in ingress/clsact_destroy, which is called in case ingress/clsact_init fails. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_ingress.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5e1cd2e5df87..fc1286f499c1 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -68,6 +68,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -78,7 +80,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; return 0; @@ -172,6 +173,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + net_inc_egress_queue(); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -192,9 +196,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); - net_inc_egress_queue(); - sch->flags |= TCQ_F_CPUSTATS; return 0; -- cgit v1.2.3 From 588753f1eb18978512b1c9b85fddb457d46f9033 Mon Sep 17 00:00:00 2001 From: Brendan McGrath Date: Wed, 13 Dec 2017 22:14:57 +1100 Subject: ipv6: icmp6: Allow icmp messages to be looped back One example of when an ICMPv6 packet is required to be looped back is when a host acts as both a Multicast Listener and a Multicast Router. A Multicast Router will listen on address ff02::16 for MLDv2 messages. Currently, MLDv2 messages originating from a Multicast Listener running on the same host as the Multicast Router are not being delivered to the Multicast Router. This is due to dst.input being assigned the default value of dst_discard. This results in the packet being looped back but discarded before being delivered to the Multicast Router. This patch sets dst.input to ip6_input to ensure a looped back packet is delivered to the Multicast Router. Signed-off-by: Brendan McGrath Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a8d1500d374..2bc91c349273 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2336,6 +2336,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, } rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; -- cgit v1.2.3 From 234833991e14681f61cbfd93e65a5c976089cf11 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 17:34:16 +0100 Subject: tipc: fix lost member events bug Group messages are not supposed to be returned to sender when the destination socket disappears. This is done correctly for regular traffic messages, by setting the 'dest_droppable' bit in the header. But we forget to do that in group protocol messages. This has the effect that such messages may sometimes bounce back to the sender, be perceived as a legitimate peer message, and wreak general havoc for the rest of the session. In particular, we have seen that a member in state LEAVING may go back to state RECLAIMED or REMITTED, hence causing suppression of an otherwise expected 'member down' event to the user. We fix this by setting the 'dest_droppable' bit even in group protocol messages. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 95fec2c057d6..efb5714e7a85 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -648,6 +648,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } + msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } -- cgit v1.2.3 From 3f42f5fe31c8715a34064bfd7b788488d1ea2f7c Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 18:13:34 +0100 Subject: tipc: remove leaving group member from all lists A group member going into state LEAVING should never go back to any other state before it is finally deleted. However, this might happen if the socket needs to send out a RECLAIM message during this interval. Since we forget to remove the leaving member from the group's 'active' or 'pending' list, the member might be selected for reclaiming, change state to RECLAIMING, and get stuck in this state instead of being deleted. This might lead to suppression of the expected 'member down' event to the receiver. We fix this by removing the member from all lists, except the RB tree, at the moment it goes into state LEAVING. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index efb5714e7a85..b96ec429bb9b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -699,6 +699,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); + list_del_init(&m->list); + list_del_init(&m->congested); + *usr_wakeup = true; /* Wait until WITHDRAW event is received */ if (m->state != MBR_LEAVING) { @@ -710,8 +713,6 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, ehdr = buf_msg(m->event_msg); msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); - *usr_wakeup = true; - list_del_init(&m->congested); return; case GRP_ADV_MSG: if (!m) @@ -863,6 +864,7 @@ void tipc_group_member_evt(struct tipc_group *grp, msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); __skb_queue_tail(inputq, skb); } + list_del_init(&m->list); list_del_init(&m->congested); } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); -- cgit v1.2.3 From 5c468674d17056148da06218d4da5d04baf22eac Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:07:25 +0800 Subject: sctp: fix the issue that a __u16 variable may overflow in sctp_ulpq_renege Now when reneging events in sctp_ulpq_renege(), the variable freed could be increased by a __u16 value twice while freed is of __u16 type. It means freed may overflow at the second addition. This patch is to fix it by using __u32 type for 'freed', while at it, also to remove 'if (chunk)' check, as all renege commands are generated in sctp_eat_data and it can't be NULL. Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index a71be33f3afe..e36ec5dd64c6 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1084,29 +1084,21 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { - struct sctp_association *asoc; - __u16 needed, freed; - - asoc = ulpq->asoc; + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed; - if (chunk) { - needed = ntohs(chunk->chunk_hdr->length); - needed -= sizeof(struct sctp_data_chunk); - } else - needed = SCTP_DEFAULT_MAXWINDOW; - - freed = 0; + needed = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_data_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); - if (freed < needed) { + if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); - } } /* If able to free enough room, accept this chunk. */ - if (chunk && (freed >= needed)) { - int retval; - retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); + if (freed >= needed) { + int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. -- cgit v1.2.3 From d196975905b2bb227dc54547c03b3d9d0013805c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:13:17 +0800 Subject: sctp: add SCTP_CID_RECONF conversion in sctp_cname Whenever a new type of chunk is added, the corresp conversion in sctp_cname should be added. Otherwise, in some places, pr_debug will print it as "unknown chunk". Fixes: cc16f00f6529 ("sctp: add support for generating stream reconf ssn reset request chunk") Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 3f619fdcbf0a..291c97b07058 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -78,6 +78,9 @@ const char *sctp_cname(const union sctp_subtype cid) case SCTP_CID_AUTH: return "AUTH"; + case SCTP_CID_RECONF: + return "RECONF"; + default: break; } -- cgit v1.2.3 From 84aeb437ab98a2bce3d4b2111c79723aedfceb33 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 18 Dec 2017 17:35:09 +0200 Subject: net: bridge: fix early call to br_stp_change_bridge_id and plug newlink leaks The early call to br_stp_change_bridge_id in bridge's newlink can cause a memory leak if an error occurs during the newlink because the fdb entries are not cleaned up if a different lladdr was specified, also another minor issue is that it generates fdb notifications with ifindex = 0. Another unrelated memory leak is the bridge sysfs entries which get added on NETDEV_REGISTER event, but are not cleaned up in the newlink error path. To remove this special case the call to br_stp_change_bridge_id is done after netdev register and we cleanup the bridge on changelink error via br_dev_delete to plug all leaks. This patch makes netlink bridge destruction on newlink error the same as dellink and ioctl del which is necessary since at that point we have a fully initialized bridge device. To reproduce the issue: $ ip l add br0 address 00:11:22:33:44:55 type bridge group_fwd_mask 1 RTNETLINK answers: Invalid argument $ rmmod bridge [ 1822.142525] ============================================================================= [ 1822.143640] BUG bridge_fdb_cache (Tainted: G O ): Objects remaining in bridge_fdb_cache on __kmem_cache_shutdown() [ 1822.144821] ----------------------------------------------------------------------------- [ 1822.145990] Disabling lock debugging due to kernel taint [ 1822.146732] INFO: Slab 0x0000000092a844b2 objects=32 used=2 fp=0x00000000fef011b0 flags=0x1ffff8000000100 [ 1822.147700] CPU: 2 PID: 13584 Comm: rmmod Tainted: G B O 4.15.0-rc2+ #87 [ 1822.148578] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 1822.150008] Call Trace: [ 1822.150510] dump_stack+0x78/0xa9 [ 1822.151156] slab_err+0xb1/0xd3 [ 1822.151834] ? __kmalloc+0x1bb/0x1ce [ 1822.152546] __kmem_cache_shutdown+0x151/0x28b [ 1822.153395] shutdown_cache+0x13/0x144 [ 1822.154126] kmem_cache_destroy+0x1c0/0x1fb [ 1822.154669] SyS_delete_module+0x194/0x244 [ 1822.155199] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 1822.155773] entry_SYSCALL_64_fastpath+0x23/0x9a [ 1822.156343] RIP: 0033:0x7f929bd38b17 [ 1822.156859] RSP: 002b:00007ffd160e9a98 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [ 1822.157728] RAX: ffffffffffffffda RBX: 00005578316ba090 RCX: 00007f929bd38b17 [ 1822.158422] RDX: 00007f929bd9ec60 RSI: 0000000000000800 RDI: 00005578316ba0f0 [ 1822.159114] RBP: 0000000000000003 R08: 00007f929bff5f20 R09: 00007ffd160e8a11 [ 1822.159808] R10: 00007ffd160e9860 R11: 0000000000000202 R12: 00007ffd160e8a80 [ 1822.160513] R13: 0000000000000000 R14: 0000000000000000 R15: 00005578316ba090 [ 1822.161278] INFO: Object 0x000000007645de29 @offset=0 [ 1822.161666] INFO: Object 0x00000000d5df2ab5 @offset=128 Fixes: 30313a3d5794 ("bridge: Handle IFLA_ADDRESS correctly when creating bridge device") Fixes: 5b8d5429daa0 ("bridge: netlink: register netdevice before executing changelink") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d0ef0a8e8831..015f465c514b 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1262,19 +1262,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, struct net_bridge *br = netdev_priv(dev); int err; + err = register_netdevice(dev); + if (err) + return err; + if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } - err = register_netdevice(dev); - if (err) - return err; - err = br_changelink(dev, tb, data, extack); if (err) - unregister_netdevice(dev); + br_dev_delete(dev, NULL); + return err; } -- cgit v1.2.3 From acf568ee859f098279eadf551612f103afdacb4e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 15 Dec 2017 16:40:44 +1100 Subject: xfrm: Reinject transport-mode packets through tasklet This is an old bugbear of mine: https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html By crafting special packets, it is possible to cause recursion in our kernel when processing transport-mode packets at levels that are only limited by packet size. The easiest one is with DNAT, but an even worse one is where UDP encapsulation is used in which case you just have to insert an UDP encapsulation header in between each level of recursion. This patch avoids this problem by reinjecting tranport-mode packets through a tasklet. Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 12 ++++++++++- net/ipv6/xfrm6_input.c | 10 ++++++++- net/xfrm/xfrm_input.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index e50b7fea57ee..bcfc00e88756 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } +static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dst_input(skb); +} + static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, iph->tos, skb->dev)) goto drop; } - return dst_input(skb); + + if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) + goto drop; + + return 0; drop: kfree_skb(skb); return NET_RX_DROP; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index fe04e23af986..841f4a07438e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, } EXPORT_SYMBOL(xfrm6_rcv_spi); +static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + if (xfrm_trans_queue(skb, ip6_rcv_finish)) + __kfree_skb(skb); + return -1; +} + int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); @@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_rcv_finish); + xfrm6_transport_finish2); return -1; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index da6447389ffb..3f6f6f8c9fa5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -8,15 +8,29 @@ * */ +#include +#include #include #include #include +#include #include #include #include #include #include +struct xfrm_trans_tasklet { + struct tasklet_struct tasklet; + struct sk_buff_head queue; +}; + +struct xfrm_trans_cb { + int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); +}; + +#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) + static struct kmem_cache *secpath_cachep __read_mostly; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); @@ -25,6 +39,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; +static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); + int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; @@ -477,9 +493,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); +static void xfrm_trans_reinject(unsigned long data) +{ + struct xfrm_trans_tasklet *trans = (void *)data; + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&trans->queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); +} + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct xfrm_trans_tasklet *trans; + + trans = this_cpu_ptr(&xfrm_trans_tasklet); + + if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + return -ENOBUFS; + + XFRM_TRANS_SKB_CB(skb)->finish = finish; + skb_queue_tail(&trans->queue, skb); + tasklet_schedule(&trans->tasklet); + return 0; +} +EXPORT_SYMBOL(xfrm_trans_queue); + void __init xfrm_input_init(void) { int err; + int i; init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); @@ -490,4 +538,13 @@ void __init xfrm_input_init(void) sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + for_each_possible_cpu(i) { + struct xfrm_trans_tasklet *trans; + + trans = &per_cpu(xfrm_trans_tasklet, i); + __skb_queue_head_init(&trans->queue); + tasklet_init(&trans->tasklet, xfrm_trans_reinject, + (unsigned long)trans); + } } -- cgit v1.2.3 From 5d32407396b0433f9b738fcfcb9599bcba7379ae Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 14 Dec 2017 14:33:38 +0100 Subject: cfg80211: always rewrite generated files from scratch Currently the certs C code generation appends to the generated files, which is most likely a leftover from commit 715a12334764 ("wireless: don't write C files on failures"). This causes duplicate code in the generated files if the certificates have their timestamps modified between builds and thereby trigger the generation rules. Fixes: 715a12334764 ("wireless: don't write C files on failures") Signed-off-by: Thierry Reding Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index d7d6cb00c47b..b662be3422e1 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -43,7 +43,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) echo "$$allf"; \ echo '};'; \ echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ - ) >> $@) + ) > $@) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @@ -66,4 +66,4 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ echo "$$allf"; \ echo '};'; \ echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ - ) >> $@) + ) > $@) -- cgit v1.2.3 From 04a7279ff12fc47b657f70731d401c0064f5838a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Dec 2017 09:26:17 +0100 Subject: cfg80211: ship certificates as hex files Not only does this remove the need for the hexdump code in most normal kernel builds (still there for the extra directory), but it also removes the need to ship binary files, which apparently is somewhat problematic, as Randy reported. While at it, also add the generated files to clean-files. Reported-by: Randy Dunlap Signed-off-by: Johannes Berg --- net/wireless/Makefile | 29 ++++--------- net/wireless/certs/sforshee.hex | 86 +++++++++++++++++++++++++++++++++++++++ net/wireless/certs/sforshee.x509 | Bin 680 -> 0 bytes 3 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 net/wireless/certs/sforshee.hex delete mode 100644 net/wireless/certs/sforshee.x509 (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index b662be3422e1..1d84f91bbfb0 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -23,27 +23,14 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) cfg80211-y += extra-certs.o endif -$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) +$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) @$(kecho) " GEN $@" - @(set -e; \ - allf=""; \ - for f in $^ ; do \ - # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ - thisf=$$(od -An -v -tx1 < $$f | \ - sed -e 's/ /\n/g' | \ - sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ - sed -e 's/^/0x/;s/$$/,/'); \ - # file should not be empty - maybe command substitution failed? \ - test ! -z "$$thisf";\ - allf=$$allf$$thisf;\ - done; \ - ( \ - echo '#include "reg.h"'; \ - echo 'const u8 shipped_regdb_certs[] = {'; \ - echo "$$allf"; \ - echo '};'; \ - echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ - ) > $@) + @(echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + cat $^ ; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) > $@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @@ -67,3 +54,5 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ echo '};'; \ echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ ) > $@) + +clean-files += shipped-certs.c extra-certs.c diff --git a/net/wireless/certs/sforshee.hex b/net/wireless/certs/sforshee.hex new file mode 100644 index 000000000000..14ea66643ffa --- /dev/null +++ b/net/wireless/certs/sforshee.hex @@ -0,0 +1,86 @@ +/* Seth Forshee's regdb certificate */ +0x30, 0x82, 0x02, 0xa4, 0x30, 0x82, 0x01, 0x8c, +0x02, 0x09, 0x00, 0xb2, 0x8d, 0xdf, 0x47, 0xae, +0xf9, 0xce, 0xa7, 0x30, 0x0d, 0x06, 0x09, 0x2a, +0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, +0x05, 0x00, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, +0x06, 0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, +0x66, 0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, +0x20, 0x17, 0x0d, 0x31, 0x37, 0x31, 0x30, 0x30, +0x36, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, 0x5a, +0x18, 0x0f, 0x32, 0x31, 0x31, 0x37, 0x30, 0x39, +0x31, 0x32, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, +0x5a, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, 0x06, +0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, 0x66, +0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, 0x82, +0x01, 0x22, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, +0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, +0x00, 0x03, 0x82, 0x01, 0x0f, 0x00, 0x30, 0x82, +0x01, 0x0a, 0x02, 0x82, 0x01, 0x01, 0x00, 0xb5, +0x40, 0xe3, 0x9c, 0x28, 0x84, 0x39, 0x03, 0xf2, +0x39, 0xd7, 0x66, 0x2c, 0x41, 0x38, 0x15, 0xac, +0x7e, 0xa5, 0x83, 0x71, 0x25, 0x7e, 0x90, 0x7c, +0x68, 0xdd, 0x6f, 0x3f, 0xd9, 0xd7, 0x59, 0x38, +0x9f, 0x7c, 0x6a, 0x52, 0xc2, 0x03, 0x2a, 0x2d, +0x7e, 0x66, 0xf4, 0x1e, 0xb3, 0x12, 0x70, 0x20, +0x5b, 0xd4, 0x97, 0x32, 0x3d, 0x71, 0x8b, 0x3b, +0x1b, 0x08, 0x17, 0x14, 0x6b, 0x61, 0xc4, 0x57, +0x8b, 0x96, 0x16, 0x1c, 0xfd, 0x24, 0xd5, 0x0b, +0x09, 0xf9, 0x68, 0x11, 0x84, 0xfb, 0xca, 0x51, +0x0c, 0xd1, 0x45, 0x19, 0xda, 0x10, 0x44, 0x8a, +0xd9, 0xfe, 0x76, 0xa9, 0xfd, 0x60, 0x2d, 0x18, +0x0b, 0x28, 0x95, 0xb2, 0x2d, 0xea, 0x88, 0x98, +0xb8, 0xd1, 0x56, 0x21, 0xf0, 0x53, 0x1f, 0xf1, +0x02, 0x6f, 0xe9, 0x46, 0x9b, 0x93, 0x5f, 0x28, +0x90, 0x0f, 0xac, 0x36, 0xfa, 0x68, 0x23, 0x71, +0x57, 0x56, 0xf6, 0xcc, 0xd3, 0xdf, 0x7d, 0x2a, +0xd9, 0x1b, 0x73, 0x45, 0xeb, 0xba, 0x27, 0x85, +0xef, 0x7a, 0x7f, 0xa5, 0xcb, 0x80, 0xc7, 0x30, +0x36, 0xd2, 0x53, 0xee, 0xec, 0xac, 0x1e, 0xe7, +0x31, 0xf1, 0x36, 0xa2, 0x9c, 0x63, 0xc6, 0x65, +0x5b, 0x7f, 0x25, 0x75, 0x68, 0xa1, 0xea, 0xd3, +0x7e, 0x00, 0x5c, 0x9a, 0x5e, 0xd8, 0x20, 0x18, +0x32, 0x77, 0x07, 0x29, 0x12, 0x66, 0x1e, 0x36, +0x73, 0xe7, 0x97, 0x04, 0x41, 0x37, 0xb1, 0xb1, +0x72, 0x2b, 0xf4, 0xa1, 0x29, 0x20, 0x7c, 0x96, +0x79, 0x0b, 0x2b, 0xd0, 0xd8, 0xde, 0xc8, 0x6c, +0x3f, 0x93, 0xfb, 0xc5, 0xee, 0x78, 0x52, 0x11, +0x15, 0x1b, 0x7a, 0xf6, 0xe2, 0x68, 0x99, 0xe7, +0xfb, 0x46, 0x16, 0x84, 0xe3, 0xc7, 0xa1, 0xe6, +0xe0, 0xd2, 0x46, 0xd5, 0xe1, 0xc4, 0x5f, 0xa0, +0x66, 0xf4, 0xda, 0xc4, 0xff, 0x95, 0x1d, 0x02, +0x03, 0x01, 0x00, 0x01, 0x30, 0x0d, 0x06, 0x09, +0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, +0x0b, 0x05, 0x00, 0x03, 0x82, 0x01, 0x01, 0x00, +0x87, 0x03, 0xda, 0xf2, 0x82, 0xc2, 0xdd, 0xaf, +0x7c, 0x44, 0x2f, 0x86, 0xd3, 0x5f, 0x4c, 0x93, +0x48, 0xb9, 0xfe, 0x07, 0x17, 0xbb, 0x21, 0xf7, +0x25, 0x23, 0x4e, 0xaa, 0x22, 0x0c, 0x16, 0xb9, +0x73, 0xae, 0x9d, 0x46, 0x7c, 0x75, 0xd9, 0xc3, +0x49, 0x57, 0x47, 0xbf, 0x33, 0xb7, 0x97, 0xec, +0xf5, 0x40, 0x75, 0xc0, 0x46, 0x22, 0xf0, 0xa0, +0x5d, 0x9c, 0x79, 0x13, 0xa1, 0xff, 0xb8, 0xa3, +0x2f, 0x7b, 0x8e, 0x06, 0x3f, 0xc8, 0xb6, 0xe4, +0x6a, 0x28, 0xf2, 0x34, 0x5c, 0x23, 0x3f, 0x32, +0xc0, 0xe6, 0xad, 0x0f, 0xac, 0xcf, 0x55, 0x74, +0x47, 0x73, 0xd3, 0x01, 0x85, 0xb7, 0x0b, 0x22, +0x56, 0x24, 0x7d, 0x9f, 0x09, 0xa9, 0x0e, 0x86, +0x9e, 0x37, 0x5b, 0x9c, 0x6d, 0x02, 0xd9, 0x8c, +0xc8, 0x50, 0x6a, 0xe2, 0x59, 0xf3, 0x16, 0x06, +0xea, 0xb2, 0x42, 0xb5, 0x58, 0xfe, 0xba, 0xd1, +0x81, 0x57, 0x1a, 0xef, 0xb2, 0x38, 0x88, 0x58, +0xf6, 0xaa, 0xc4, 0x2e, 0x8b, 0x5a, 0x27, 0xe4, +0xa5, 0xe8, 0xa4, 0xca, 0x67, 0x5c, 0xac, 0x72, +0x67, 0xc3, 0x6f, 0x13, 0xc3, 0x2d, 0x35, 0x79, +0xd7, 0x8a, 0xe7, 0xf5, 0xd4, 0x21, 0x30, 0x4a, +0xd5, 0xf6, 0xa3, 0xd9, 0x79, 0x56, 0xf2, 0x0f, +0x10, 0xf7, 0x7d, 0xd0, 0x51, 0x93, 0x2f, 0x47, +0xf8, 0x7d, 0x4b, 0x0a, 0x84, 0x55, 0x12, 0x0a, +0x7d, 0x4e, 0x3b, 0x1f, 0x2b, 0x2f, 0xfc, 0x28, +0xb3, 0x69, 0x34, 0xe1, 0x80, 0x80, 0xbb, 0xe2, +0xaf, 0xb9, 0xd6, 0x30, 0xf1, 0x1d, 0x54, 0x87, +0x23, 0x99, 0x9f, 0x51, 0x03, 0x4c, 0x45, 0x7d, +0x02, 0x65, 0x73, 0xab, 0xfd, 0xcf, 0x94, 0xcc, +0x0d, 0x3a, 0x60, 0xfd, 0x3c, 0x14, 0x2f, 0x16, +0x33, 0xa9, 0x21, 0x1f, 0xcb, 0x50, 0xb1, 0x8f, +0x03, 0xee, 0xa0, 0x66, 0xa9, 0x16, 0x79, 0x14, diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509 deleted file mode 100644 index c6f8f9d6b988..000000000000 Binary files a/net/wireless/certs/sforshee.x509 and /dev/null differ -- cgit v1.2.3 From cfddd4c33c254954927942599d299b3865743146 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:24:35 +0800 Subject: ip_gre: remove the incorrect mtu limit for ipgre tap ipgre tap driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the ipgre tap device to not be greater than 1500, this limit value is not correct for ipgre tap device. Besides, it's .change_mtu already does the right check. So this patch is just to set max_mtu as 0, and leave the check to it's .change_mtu. Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9c1735632c8c..45ffd3d045d2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1310,6 +1310,7 @@ static const struct net_device_ops erspan_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; -- cgit v1.2.3 From 2c52129a7d74d017320804c6928de770815c5f4a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:25:09 +0800 Subject: ip6_gre: remove the incorrect mtu limit for ipgre tap The same fix as the patch "ip_gre: remove the incorrect mtu limit for ipgre tap" is also needed for ip6_gre. Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4cfd8e0696fe..416c8913f132 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1308,6 +1308,7 @@ static void ip6gre_tap_setup(struct net_device *dev) ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; -- cgit v1.2.3 From c9fefa08190fc879fb2e681035d7774e0a8c5170 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Dec 2017 14:26:21 +0800 Subject: ip6_tunnel: get the min mtu properly in ip6_tnl_xmit Now it's using IPV6_MIN_MTU as the min mtu in ip6_tnl_xmit, but IPV6_MIN_MTU actually only works when the inner packet is ipv6. With IPV6_MIN_MTU for ipv4 packets, the new pmtu for inner dst couldn't be set less than 1280. It would cause tx_err and the packet to be dropped when the outer dst pmtu is close to 1280. Jianlin found it by running ipv4 traffic with the topo: (client) gre6 <---> eth1 (route) eth2 <---> gre6 (server) After changing eth2 mtu to 1300, the performance became very low, or the connection was even broken. The issue also affects ip4ip6 and ip6ip6 tunnels. So if the inner packet is ipv4, 576 should be considered as the min mtu. Note that for ip4ip6 and ip6ip6 tunnels, the inner packet can only be ipv4 or ipv6, but for gre6 tunnel, it may also be ARP. This patch using 576 as the min mtu for non-ipv6 packet works for all those cases. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index db84f523656d..931c38f6ff4a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1123,8 +1123,13 @@ route_lookup: max_headroom += 8; mtu -= 8; } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } else if (mtu < 576) { + mtu = 576; + } + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { -- cgit v1.2.3 From 3db096011722fd8717e57687ae94b6917a11c9cc Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 18 Dec 2017 20:03:05 +0100 Subject: tipc: fix list sorting bug in function tipc_group_update_member() When, during a join operation, or during message transmission, a group member needs to be added to the group's 'congested' list, we sort it into the list in ascending order, according to its current advertised window size. However, we miss the case when the member is already on that list. This will have the result that the member, after the window size has been decremented, might be at the wrong position in that list. This again may have the effect that we during broadcast and multicast transmissions miss the fact that a destination is not yet ready for reception, and we end up sending anyway. From this point on, the behavior during the remaining session is unpredictable, e.g., with underflowing window sizes. We now correct this bug by unconditionally removing the member from the list before (re-)sorting it in. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index b96ec429bb9b..bbc004eaa31a 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -351,8 +351,7 @@ void tipc_group_update_member(struct tipc_member *m, int len) if (m->window >= ADV_IDLE) return; - if (!list_empty(&m->congested)) - return; + list_del_init(&m->congested); /* Sort member into congested members' list */ list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { -- cgit v1.2.3 From d03a45572efa068fa64db211d6d45222660e76c5 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 19 Dec 2017 15:17:13 +0100 Subject: ipv4: fib: Fix metrics match when deleting a route The recently added fib_metrics_match() causes a regression for routes with both RTAX_FEATURES and RTAX_CC_ALGO if the latter has TCP_CONG_NEEDS_ECN flag set: | # ip link add d0 type dummy | # ip link set d0 up | # ip route add 172.29.29.0/24 dev d0 features ecn congctl dctcp | # ip route del 172.29.29.0/24 dev d0 features ecn congctl dctcp | RTNETLINK answers: No such process During route insertion, fib_convert_metrics() detects that the given CC algo requires ECN and hence sets DST_FEATURE_ECN_CA bit in RTAX_FEATURES. During route deletion though, fib_metrics_match() compares stored RTAX_FEATURES value with that from userspace (which obviously has no knowledge about DST_FEATURE_ECN_CA) and fails. Fixes: 5f9ae3d9e7e4a ("ipv4: do metrics match when looking up and deleting a route") Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f04d944f8abe..c586597da20d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -698,7 +698,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); - u32 val; + u32 fi_val, val; if (!type) continue; @@ -715,7 +715,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) val = nla_get_u32(nla); } - if (fi->fib_metrics->metrics[type - 1] != val) + fi_val = fi->fib_metrics->metrics[type - 1]; + if (type == RTAX_FEATURES) + fi_val &= ~DST_FEATURE_ECN_CA; + + if (fi_val != val) return false; } -- cgit v1.2.3 From 21b5944350052d2583e82dd59b19a9ba94a007f0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 19 Dec 2017 11:27:56 -0600 Subject: net: Fix double free and memory corruption in get_net_ns_by_id() (I can trivially verify that that idr_remove in cleanup_net happens after the network namespace count has dropped to zero --EWB) Function get_net_ns_by_id() does not check for net::count after it has found a peer in netns_ids idr. It may dereference a peer, after its count has already been finaly decremented. This leads to double free and memory corruption: put_net(peer) rtnl_lock() atomic_dec_and_test(&peer->count) [count=0] ... __put_net(peer) get_net_ns_by_id(net, id) spin_lock(&cleanup_list_lock) list_add(&net->cleanup_list, &cleanup_list) spin_unlock(&cleanup_list_lock) queue_work() peer = idr_find(&net->netns_ids, id) | get_net(peer) [count=1] | ... | (use after final put) v ... cleanup_net() ... spin_lock(&cleanup_list_lock) ... list_replace_init(&cleanup_list, ..) ... spin_unlock(&cleanup_list_lock) ... ... ... ... put_net(peer) ... atomic_dec_and_test(&peer->count) [count=0] ... spin_lock(&cleanup_list_lock) ... list_add(&net->cleanup_list, &cleanup_list) ... spin_unlock(&cleanup_list_lock) ... queue_work() ... rtnl_unlock() rtnl_lock() ... for_each_net(tmp) { ... id = __peernet2id(tmp, peer) ... spin_lock_irq(&tmp->nsid_lock) ... idr_remove(&tmp->netns_ids, id) ... ... ... net_drop_ns() ... net_free(peer) ... } ... | v cleanup_net() ... (Second free of peer) Also, put_net() on the right cpu may reorder with left's cpu list_replace_init(&cleanup_list, ..), and then cleanup_list will be corrupted. Since cleanup_net() is executed in worker thread, while put_net(peer) can happen everywhere, there should be enough time for concurrent get_net_ns_by_id() to pick the peer up, and the race does not seem to be unlikely. The patch fixes the problem in standard way. (Also, there is possible problem in peernet2id_alloc(), which requires check for net::count under nsid_lock and maybe_get_net(peer), but in current stable kernel it's used under rtnl_lock() and it has to be safe. Openswitch begun to use peernet2id_alloc(), and possibly it should be fixed too. While this is not in stable kernel yet, so I'll send a separate message to netdev@ later). Cc: Nicolas Dichtel Signed-off-by: Kirill Tkhai Fixes: 0c7aecd4bde4 "netns: add rtnl cmd to add and get peer netns ids" Reviewed-by: Andrey Ryabinin Reviewed-by: "Eric W. Biederman" Signed-off-by: Eric W. Biederman Reviewed-by: Eric Dumazet Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b797832565d3..60a71be75aea 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -267,7 +267,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) - get_net(peer); + peer = maybe_get_net(peer); spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); -- cgit v1.2.3 From 102740bd9436a3a6ba129af3a48271d794009fa5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Dec 2017 13:32:13 -0800 Subject: cls_bpf: fix offload assumptions after callback conversion cls_bpf used to take care of tracking what offload state a filter is in, i.e. it would track if offload request succeeded or not. This information would then be used to issue correct requests to the driver, e.g. requests for statistics only on offloaded filters, removing only filters which were offloaded, using add instead of replace if previous filter was not added etc. This tracking of offload state no longer functions with the new callback infrastructure. There could be multiple entities trying to offload the same filter. Throw out all the tracking and corresponding commands and simply pass to the drivers both old and new bpf program. Drivers will have to deal with offload state tracking by themselves. Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload") Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 93 ++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6fe798c2df1a..8d78e7f4ecc3 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -42,7 +42,6 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; - bool offloaded; u32 gen_flags; struct tcf_exts exts; u32 handle; @@ -148,33 +147,37 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) } static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, - enum tc_clsbpf_command cmd) + struct cls_bpf_prog *oldprog) { - bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; struct tcf_block *block = tp->chain->block; - bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *obj; + bool skip_sw; int err; + skip_sw = prog && tc_skip_sw(prog->gen_flags); + obj = prog ?: oldprog; + tc_cls_common_offload_init(&cls_bpf.common, tp); - cls_bpf.command = cmd; - cls_bpf.exts = &prog->exts; - cls_bpf.prog = prog->filter; - cls_bpf.name = prog->bpf_name; - cls_bpf.exts_integrated = prog->exts_integrated; - cls_bpf.gen_flags = prog->gen_flags; + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &obj->exts; + cls_bpf.prog = prog ? prog->filter : NULL; + cls_bpf.oldprog = oldprog ? oldprog->filter : NULL; + cls_bpf.name = obj->bpf_name; + cls_bpf.exts_integrated = obj->exts_integrated; + cls_bpf.gen_flags = obj->gen_flags; err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); - if (addorrep) { + if (prog) { if (err < 0) { - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + cls_bpf_offload_cmd(tp, oldprog, prog); return err; } else if (err > 0) { prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; } } - if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) + if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; @@ -183,38 +186,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - struct cls_bpf_prog *obj = prog; - enum tc_clsbpf_command cmd; - bool skip_sw; - int ret; - - skip_sw = tc_skip_sw(prog->gen_flags) || - (oldprog && tc_skip_sw(oldprog->gen_flags)); - - if (oldprog && oldprog->offloaded) { - if (!tc_skip_hw(prog->gen_flags)) { - cmd = TC_CLSBPF_REPLACE; - } else if (!tc_skip_sw(prog->gen_flags)) { - obj = oldprog; - cmd = TC_CLSBPF_DESTROY; - } else { - return -EINVAL; - } - } else { - if (tc_skip_hw(prog->gen_flags)) - return skip_sw ? -EINVAL : 0; - cmd = TC_CLSBPF_ADD; - } - - ret = cls_bpf_offload_cmd(tp, obj, cmd); - if (ret) - return ret; + if (prog && oldprog && prog->gen_flags != oldprog->gen_flags) + return -EINVAL; - obj->offloaded = true; - if (oldprog) - oldprog->offloaded = false; + if (prog && tc_skip_hw(prog->gen_flags)) + prog = NULL; + if (oldprog && tc_skip_hw(oldprog->gen_flags)) + oldprog = NULL; + if (!prog && !oldprog) + return 0; - return 0; + return cls_bpf_offload_cmd(tp, prog, oldprog); } static void cls_bpf_stop_offload(struct tcf_proto *tp, @@ -222,25 +204,26 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, { int err; - if (!prog->offloaded) - return; - - err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); - if (err) { + err = cls_bpf_offload_cmd(tp, NULL, prog); + if (err) pr_err("Stopping hardware offload failed: %d\n", err); - return; - } - - prog->offloaded = false; } static void cls_bpf_offload_update_stats(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - if (!prog->offloaded) - return; + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + + tc_cls_common_offload_init(&cls_bpf.common, tp); + cls_bpf.command = TC_CLSBPF_STATS; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + cls_bpf.gen_flags = prog->gen_flags; - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false); } static int cls_bpf_init(struct tcf_proto *tp) -- cgit v1.2.3 From bb25c3855a12cc58e33cd7ee9b69943790fe35f7 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 20 Dec 2017 11:03:15 +0100 Subject: tipc: remove joining group member from congested list When we receive a JOIN message from a peer member, the message may contain an advertised window value ADV_IDLE that permits removing the member in question from the tipc_group::congested list. However, since the removal has been made conditional on that the advertised window is *not* ADV_IDLE, we miss this case. This has the effect that a sender sometimes may enter a state of permanent, false, broadcast congestion. We fix this by unconditinally removing the member from the congested list before calling tipc_member_update(), which might potentially sort it into the list again. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index bbc004eaa31a..7ebbdeb2a90e 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -689,10 +689,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); __skb_queue_tail(inputq, m->event_msg); } - if (m->window < ADV_IDLE) - tipc_group_update_member(m, 0); - else - list_del_init(&m->congested); + list_del_init(&m->congested); + tipc_group_update_member(m, 0); return; case GRP_LEAVE_MSG: if (!m) -- cgit v1.2.3 From b4681c2829e24943aadd1a7bb3a30d41d0a20050 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2017 19:34:19 +0200 Subject: ipv4: Fix use-after-free when flushing FIB tables Since commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") the local table uses the same trie allocated for the main table when custom rules are not in use. When a net namespace is dismantled, the main table is flushed and freed (via an RCU callback) before the local table. In case the callback is invoked before the local table is iterated, a use-after-free can occur. Fix this by iterating over the FIB tables in reverse order, so that the main table is always freed after the local table. v3: Reworded comment according to Alex's suggestion. v2: Add a comment to make the fix more explicit per Dave's and Alex's feedback. Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") Signed-off-by: Ido Schimmel Reported-by: Fengguang Wu Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f52d27a422c3..08259d078b1c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1298,14 +1298,19 @@ err_table_hash_alloc: static void ip_fib_net_exit(struct net *net) { - unsigned int i; + int i; rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + /* Destroy the tables in reverse order to guarantee that the + * local table, ID 255, is destroyed before the main table, ID + * 254. This is necessary as the local table may contain + * references to data contained in the main table. + */ + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; -- cgit v1.2.3 From 58acfd714e6b02e8617448b431c2b64a2f1f0792 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2017 12:28:25 +0200 Subject: ipv6: Honor specified parameters in fibmatch lookup Currently, parameters such as oif and source address are not taken into account during fibmatch lookup. Example (IPv4 for reference) before patch: $ ip -4 route show 192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1 198.51.100.0/24 dev dummy1 proto kernel scope link src 198.51.100.1 $ ip -6 route show 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium 2001:db8:2::/64 dev dummy1 proto kernel metric 256 pref medium fe80::/64 dev dummy0 proto kernel metric 256 pref medium fe80::/64 dev dummy1 proto kernel metric 256 pref medium $ ip -4 route get fibmatch 192.0.2.2 oif dummy0 192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1 $ ip -4 route get fibmatch 192.0.2.2 oif dummy1 RTNETLINK answers: No route to host $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium After: $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0 2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium $ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1 RTNETLINK answers: Network is unreachable The problem stems from the fact that the necessary route lookup flags are not set based on these parameters. Instead of duplicating the same logic for fibmatch, we can simply resolve the original route from its copy and dump it instead. Fixes: 18c3a61c4264 ("net: ipv6: RTM_GETROUTE: return matched fib result when requested") Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2bc91c349273..0458b761f3c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4298,19 +4298,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - if (!fibmatch) - dst = ip6_route_input_lookup(net, dev, &fl6, flags); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_input_lookup(net, dev, &fl6, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; - if (!fibmatch) - dst = ip6_route_output(net, NULL, &fl6); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_output(net, NULL, &fl6); } @@ -4327,6 +4321,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + if (fibmatch && rt->dst.from) { + struct rt6_info *ort = container_of(rt->dst.from, + struct rt6_info, dst); + + dst_hold(&ort->dst); + ip6_rt_put(rt); + rt = ort; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); -- cgit v1.2.3 From c48e74736fccf25fb32bb015426359e1c2016e3b Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 20 Dec 2017 15:09:22 -0500 Subject: openvswitch: Fix pop_vlan action for double tagged frames skb_vlan_pop() expects skb->protocol to be a valid TPID for double tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop() shift the true ethertype into position for us. Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets") Signed-off-by: Eric Garver Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index dbe2379329c5..f039064ce922 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -579,6 +579,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -EINVAL; skb_reset_network_header(skb); + key->eth.type = skb->protocol; } else { eth = eth_hdr(skb); ether_addr_copy(key->eth.src, eth->h_source); @@ -592,15 +593,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(parse_vlan(skb, key))) return -ENOMEM; - skb->protocol = parse_ethertype(skb); - if (unlikely(skb->protocol == htons(0))) + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) return -ENOMEM; + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; + skb_reset_network_header(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); } skb_reset_mac_len(skb); - key->eth.type = skb->protocol; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { -- cgit v1.2.3 From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 12:10:21 -0800 Subject: net: reevalulate autoflowlabel setting after sysctl setting sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2. If sockopt doesn't set autoflowlabel, outcome packets from the hosts are supposed to not include flowlabel. This is true for normal packet, but not for reset packet. The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't changed, so the sock will keep the old behavior in terms of auto flowlabel. Reset packet is suffering from this problem, because reset packet is sent from a special control socket, which is created at boot time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control socket will always have its ipv6_pinfo.autoflowlabel set, even after user set sysctl.ipv6.auto_flowlabels to 1, so reset packset will always have flowlabel. Normal sock created before sysctl setting suffers from the same issue. We can't even turn off autoflowlabel unless we kill all socks in the hosts. To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the autoflowlabel setting from user, otherwise we always call ip6_default_np_autolabel() which has the new settings of sysctl. Note, this changes behavior a little bit. Before commit 42240901f7c4 (ipv6: Implement different admin modes for automatic flow labels), the autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes, existing connection will change autoflowlabel behavior. After that commit, autoflowlabel behavior is sticky in the whole life of the sock. With this patch, the behavior isn't sticky again. Cc: Martin KaFai Lau Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: Shaohua Li Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 1 - net/ipv6/ip6_output.c | 12 ++++++++++-- net/ipv6/ipv6_sockglue.c | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c26f71234b9c..c9441ca45399 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..f7dd51c42314 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b9404feabd78..2d4680e0376f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -886,6 +886,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; + np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: -- cgit v1.2.3 From 268b790679422a89e9ab0685d9f291edae780c98 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 20 Dec 2017 17:37:49 -0500 Subject: skbuff: orphan frags before zerocopy clone Call skb_zerocopy_clone after skb_orphan_frags, to avoid duplicate calls to skb_uarg(skb)->callback for the same data. skb_zerocopy_clone associates skb_shinfo(skb)->uarg from frag_skb with each segment. This is only safe for uargs that do refcounting, which is those that pass skb_orphan_frags without dropping their shared frags. For others, skb_orphan_frags drops the user frags and sets the uarg to NULL, after which sock_zerocopy_clone has no effect. Qemu hangs were reported due to duplicate vhost_net_zerocopy_callback calls for the same data causing the vhost_net_ubuf_ref_>refcount to drop below zero. Link: http://lkml.kernel.org/r/ Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Reported-by: Andreas Hartmann Reported-by: David Hill Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a592ca025fc4..edf40ac0cd07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3654,8 +3654,6 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) - goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -3681,6 +3679,8 @@ normal: if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) goto err; + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; *nskb_frag = *frag; __skb_frag_ref(nskb_frag); -- cgit v1.2.3 From b90ddd568792bcb0054eaf0f61785c8f80c3bd1c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 20 Dec 2017 17:37:50 -0500 Subject: skbuff: skb_copy_ubufs must release uarg even without user frags skb_copy_ubufs creates a private copy of frags[] to release its hold on user frags, then calls uarg->callback to notify the owner. Call uarg->callback even when no frags exist. This edge case can happen when zerocopy_sg_from_iter finds enough room in skb_headlen to copy all the data. Fixes: 3ece782693c4 ("sock: skb_copy_ubufs support for compound pages") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index edf40ac0cd07..a3cb0be4c6f3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1178,7 +1178,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) u32 d_off; if (!num_frags) - return 0; + goto release; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; @@ -1238,6 +1238,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; +release: skb_zcopy_clear(skb, false); return 0; } -- cgit v1.2.3 From e5a9336adb317db55eb3fe8200856096f3c71109 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 20 Dec 2017 19:36:03 +0300 Subject: ip6_gre: fix device features for ioctl setup When ip6gre is created using ioctl, its features, such as scatter-gather, GSO and tx-checksumming will be turned off: # ip -f inet6 tunnel add gre6 mode ip6gre remote fd00::1 # ethtool -k gre6 (truncated output) tx-checksumming: off scatter-gather: off tcp-segmentation-offload: off generic-segmentation-offload: off [requested on] But when netlink is used, they will be enabled: # ip link add gre6 type ip6gre remote fd00::1 # ethtool -k gre6 (truncated output) tx-checksumming: on scatter-gather: on tcp-segmentation-offload: on generic-segmentation-offload: on This results in a loss of performance when gre6 is created via ioctl. The issue was found with LTP/gre tests. Fix it by moving the setup of device features to a separate function and invoke it with ndo_init callback because both netlink and ioctl will eventually call it via register_netdevice(): register_netdevice() - ndo_init() callback -> ip6gre_tunnel_init() or ip6gre_tap_init() - ip6gre_tunnel_init_common() - ip6gre_tnl_init_features() The moved code also contains two minor style fixes: * removed needless tab from GRE6_FEATURES on NETIF_F_HIGHDMA line. * fixed the issue reported by checkpatch: "Unnecessary parentheses around 'nt->encap.type == TUNNEL_ENCAP_NONE'" Fixes: ac4eb009e477 ("ip6gre: Add support for basic offloads offloads excluding GSO") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 57 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 416c8913f132..772695960890 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1014,6 +1014,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1048,6 +1078,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + ip6gre_tnl_init_features(dev); + return 0; } @@ -1298,11 +1330,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) - static void ip6gre_tap_setup(struct net_device *dev) { @@ -1383,26 +1410,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } - err = register_netdevice(dev); if (err) goto out; -- cgit v1.2.3 From b2fb01f426883a794ed80be9110675a2d8356347 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Dec 2017 23:26:24 -0800 Subject: net_sched: fix a missing rcu barrier in mini_qdisc_pair_swap() The rcu_barrier_bh() in mini_qdisc_pair_swap() is to wait for flying RCU callback installed by a previous mini_qdisc_pair_swap(), however we miss it on the tp_head==NULL path, which leads to that the RCU callback still uses miniq_old->rcu after it is freed together with qdisc in qdisc_graft(). So just add it on that path too. Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath ") Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Cc: Jiri Pirko Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cd1b200acae7..661c7144b53a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1040,6 +1040,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + /* Wait for flying RCU callback before it is freed. */ + rcu_barrier_bh(); return; } @@ -1055,7 +1057,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, rcu_assign_pointer(*miniqp->p_miniq, miniq); if (miniq_old) - /* This is counterpart of the rcu barrier above. We need to + /* This is counterpart of the rcu barriers above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ -- cgit v1.2.3 From 0a3d805c9c503e05d6e5d3868c53e92a06589dcf Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 21 Dec 2017 13:07:11 +0100 Subject: tipc: base group replicast ack counter on number of actual receivers In commit 2f487712b893 ("tipc: guarantee that group broadcast doesn't bypass group unicast") we introduced a mechanism that requires the first (replicated) broadcast sent after a unicast to be acknowledged by all receivers before permitting sending of the next (true) broadcast. The counter for keeping track of the number of acknowledges to expect is based on the tipc_group::member_cnt variable. But this misses that some of the known members may not be ready for reception, and will never acknowledge the message, either because they haven't fully joined the group or because they are leaving the group. Such members are identified by not fulfilling the condition tested for in the function tipc_group_is_enabled(). We now set the counter for the actual number of acks to receive at the moment the message is sent, by just counting the number of recipients satisfying the tipc_group_is_enabled() test. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 7ebbdeb2a90e..e5b03f08f076 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -368,18 +368,20 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; + u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_enabled(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; + ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) - grp->bc_ackers = grp->member_cnt; + grp->bc_ackers = ackers; grp->bc_snd_nxt++; } -- cgit v1.2.3 From 4853f128c13ed2731625dff2410b7fdbe540fb26 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 21 Dec 2017 13:13:59 +0100 Subject: net: sched: fix possible null pointer deref in tcf_block_put We need to check block for being null in both tcf_block_put and tcf_block_put_ext. Fixes: 343723dd51ef ("net: sched: fix clsact init error path") Reported-by: Prashant Bhole Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b91ea03e3afa..b9d63d2246e6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -379,6 +379,8 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; + if (!block) + return; tcf_block_put_ext(block, block->q, &ei); } -- cgit v1.2.3 From 3a33a19bf88cdfc6d982972bc6ffcf7a62c1015e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 21 Dec 2017 14:36:34 +0100 Subject: tipc: fix memory leak of group member when peer node is lost When a group member receives a member WITHDRAW event, this might have two reasons: either the peer member is leaving the group, or the link to the member's node has been lost. In the latter case we need to issue a DOWN event to the user right away, and let function tipc_group_filter_msg() perform delete of the member item. However, in this case we miss to change the state of the member item to MBR_LEAVING, so the member item is not deleted, and we have a memory leak. We now separate better between the four sub-cases of a WITHRAW event and make sure that each case is handled correctly. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index e5b03f08f076..8e12ab55346b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -850,17 +850,26 @@ void tipc_group_member_evt(struct tipc_group *grp, *usr_wakeup = true; m->usr_pending = false; node_up = tipc_node_is_up(net, node); - - /* Hold back event if more messages might be expected */ - if (m->state != MBR_LEAVING && node_up) { - m->event_msg = skb; - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - } else { - if (node_up) + m->event_msg = NULL; + + if (node_up) { + /* Hold back event if a LEAVE msg should be expected */ + if (m->state != MBR_LEAVING) { + m->event_msg = skb; + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + } else { msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - else + __skb_queue_tail(inputq, skb); + } + } else { + if (m->state != MBR_LEAVING) { + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); + } else { + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + } __skb_queue_tail(inputq, skb); } list_del_init(&m->list); -- cgit v1.2.3 From 14e138a86f6347c6199f610576d2e11c03bec5f0 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Thu, 21 Dec 2017 20:17:04 -0800 Subject: RDS: Check cmsg_len before dereferencing CMSG_DATA RDS currently doesn't check if the length of the control message is large enough to hold the required data, before dereferencing the control message data. This results in following crash: BUG: KASAN: stack-out-of-bounds in rds_rdma_bytes net/rds/send.c:1013 [inline] BUG: KASAN: stack-out-of-bounds in rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066 Read of size 8 at addr ffff8801c928fb70 by task syzkaller455006/3157 CPU: 0 PID: 3157 Comm: syzkaller455006 Not tainted 4.15.0-rc3+ #161 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 rds_rdma_bytes net/rds/send.c:1013 [inline] rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 ___sys_sendmsg+0x320/0x8b0 net/socket.c:2018 __sys_sendmmsg+0x1ee/0x620 net/socket.c:2108 SYSC_sendmmsg net/socket.c:2139 [inline] SyS_sendmmsg+0x35/0x60 net/socket.c:2134 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x43fe49 RSP: 002b:00007fffbe244ad8 EFLAGS: 00000217 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fe49 RDX: 0000000000000001 RSI: 000000002020c000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000217 R12: 00000000004017b0 R13: 0000000000401840 R14: 0000000000000000 R15: 0000000000000000 To fix this, we verify that the cmsg_len is large enough to hold the data to be read, before proceeding further. Reported-by: syzbot Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Reviewed-by: Yuval Shaia Signed-off-by: David S. Miller --- net/rds/send.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index b52cdc8ae428..f72466c63f0c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + if (cmsg->cmsg_len < + CMSG_LEN(sizeof(struct rds_rdma_args))) + return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } -- cgit v1.2.3 From 19142551b2be4a9e13838099fde1351386e5e007 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Fri, 22 Dec 2017 09:35:16 +0200 Subject: tipc: error path leak fixes in tipc_enable_bearer() Fix memory leak in tipc_enable_bearer() if enable_media() fails, and cleanup with bearer_disable() if tipc_mon_create() fails. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tommi Rantala Signed-off-by: David S. Miller --- net/tipc/bearer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 47ec121574ce..c8001471da6c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -324,6 +324,7 @@ restart: if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); + kfree(b); return -EINVAL; } @@ -347,8 +348,10 @@ restart: if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - if (tipc_mon_create(net, bearer_id)) + if (tipc_mon_create(net, bearer_id)) { + bearer_disable(net, b); return -ENOMEM; + } pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, -- cgit v1.2.3 From 642a8439ddd8423b92f2e71960afe21ee1f66bb6 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Fri, 22 Dec 2017 09:35:17 +0200 Subject: tipc: fix tipc_mon_delete() oops in tipc_enable_bearer() error path Calling tipc_mon_delete() before the monitor has been created will oops. This can happen in tipc_enable_bearer() error path if tipc_disc_create() fails. [ 48.589074] BUG: unable to handle kernel paging request at 0000000000001008 [ 48.590266] IP: tipc_mon_delete+0xea/0x270 [tipc] [ 48.591223] PGD 1e60c5067 P4D 1e60c5067 PUD 1eb0cf067 PMD 0 [ 48.592230] Oops: 0000 [#1] SMP KASAN [ 48.595610] CPU: 5 PID: 1199 Comm: tipc Tainted: G B 4.15.0-rc4-pc64-dirty #5 [ 48.597176] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 [ 48.598489] RIP: 0010:tipc_mon_delete+0xea/0x270 [tipc] [ 48.599347] RSP: 0018:ffff8801d827f668 EFLAGS: 00010282 [ 48.600705] RAX: ffff8801ee813f00 RBX: 0000000000000204 RCX: 0000000000000000 [ 48.602183] RDX: 1ffffffff1de6a75 RSI: 0000000000000297 RDI: 0000000000000297 [ 48.604373] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff1dd1533 [ 48.605607] R10: ffffffff8eafbb05 R11: fffffbfff1dd1534 R12: 0000000000000050 [ 48.607082] R13: dead000000000200 R14: ffffffff8e73f310 R15: 0000000000001020 [ 48.608228] FS: 00007fc686484800(0000) GS:ffff8801f5540000(0000) knlGS:0000000000000000 [ 48.610189] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 48.611459] CR2: 0000000000001008 CR3: 00000001dda70002 CR4: 00000000003606e0 [ 48.612759] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 48.613831] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 48.615038] Call Trace: [ 48.615635] tipc_enable_bearer+0x415/0x5e0 [tipc] [ 48.620623] tipc_nl_bearer_enable+0x1ab/0x200 [tipc] [ 48.625118] genl_family_rcv_msg+0x36b/0x570 [ 48.631233] genl_rcv_msg+0x5a/0xa0 [ 48.631867] netlink_rcv_skb+0x1cc/0x220 [ 48.636373] genl_rcv+0x24/0x40 [ 48.637306] netlink_unicast+0x29c/0x350 [ 48.639664] netlink_sendmsg+0x439/0x590 [ 48.642014] SYSC_sendto+0x199/0x250 [ 48.649912] do_syscall_64+0xfd/0x2c0 [ 48.650651] entry_SYSCALL64_slow_path+0x25/0x25 [ 48.651843] RIP: 0033:0x7fc6859848e3 [ 48.652539] RSP: 002b:00007ffd25dff938 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 48.654003] RAX: ffffffffffffffda RBX: 00007ffd25dff990 RCX: 00007fc6859848e3 [ 48.655303] RDX: 0000000000000054 RSI: 00007ffd25dff990 RDI: 0000000000000003 [ 48.656512] RBP: 00007ffd25dff980 R08: 00007fc685c35fc0 R09: 000000000000000c [ 48.657697] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000d13010 [ 48.658840] R13: 00007ffd25e009c0 R14: 0000000000000000 R15: 0000000000000000 [ 48.662972] RIP: tipc_mon_delete+0xea/0x270 [tipc] RSP: ffff8801d827f668 [ 48.664073] CR2: 0000000000001008 [ 48.664576] ---[ end trace e811818d54d5ce88 ]--- Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tommi Rantala Signed-off-by: David S. Miller --- net/tipc/monitor.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 8e884ed06d4b..32dc33a94bc7 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -642,9 +642,13 @@ void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); - struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *self; struct tipc_peer *peer, *tmp; + if (!mon) + return; + + self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { -- cgit v1.2.3 From 8cb38a602478e9f806571f6920b0a3298aabf042 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 22 Dec 2017 10:15:20 -0800 Subject: sctp: Replace use of sockets_allocated with specified macro. The patch(180d8cd942ce) replaces all uses of struct sock fields' memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem to accessor macros. But the sockets_allocated field of sctp sock is not replaced at all. Then replace it now for unifying the code. Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.") Cc: Glauber Costa Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3253f724a995..b4fb6e4886d2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4498,7 +4498,7 @@ static int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); local_bh_disable(); - percpu_counter_inc(&sctp_sockets_allocated); + sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); /* Nothing can fail after this block, otherwise @@ -4542,7 +4542,7 @@ static void sctp_destroy_sock(struct sock *sk) } sctp_endpoint_free(sp->ep); local_bh_disable(); - percpu_counter_dec(&sctp_sockets_allocated); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); } -- cgit v1.2.3 From 517d7c79bdb39864e617960504bdc1aa560c75c6 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 28 Dec 2017 12:03:06 +0100 Subject: tipc: fix hanging poll() for stream sockets In commit 42b531de17d2f6 ("tipc: Fix missing connection request handling"), we replaced unconditional wakeup() with condtional wakeup for clients with flags POLLIN | POLLRDNORM | POLLRDBAND. This breaks the applications which do a connect followed by poll with POLLOUT flag. These applications are not woken when the connection is ESTABLISHED and hence sleep forever. In this commit, we fix it by including the POLLOUT event for sockets in TIPC_CONNECTING state. Fixes: 42b531de17d2f6 ("tipc: Fix missing connection request handling") Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 41127d0b925e..3b4084480377 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -727,11 +727,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: + case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: - case TIPC_CONNECTING: if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= POLLIN | POLLRDNORM; break; -- cgit v1.2.3 From f72c4ac695573699dde5b71da1c3b9ef80440616 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 28 Dec 2017 12:38:13 -0500 Subject: skbuff: in skb_copy_ubufs unclone before releasing zerocopy skb_copy_ubufs must unclone before it is safe to modify its skb_shared_info with skb_zcopy_clear. Commit b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags") ensures that all skbs release their zerocopy state, even those without frags. But I forgot an edge case where such an skb arrives that is cloned. The stack does not build such packets. Vhost/tun skbs have their frags orphaned before cloning. TCP skbs only attach zerocopy state when a frag is added. But if TCP packets can be trimmed or linearized, this might occur. Tracing the code I found no instance so far (e.g., skb_linearize ends up calling skb_zcopy_clear if !skb->data_len). Still, it is non-obvious that no path exists. And it is fragile to rely on this. Fixes: b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a3cb0be4c6f3..08f574081315 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1177,12 +1177,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) int i, new_frags; u32 d_off; - if (!num_frags) - goto release; - if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!num_frags) + goto release; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); -- cgit v1.2.3 From d66fa9ec53c43bba9fa973c16419f6061b7cc3ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 28 Dec 2017 11:00:44 -0800 Subject: strparser: Call sock_owned_by_user_nocheck strparser wants to check socket ownership without producing any warnings. As indicated by the comment in the code, it is permissible for owned_by_user to return true. Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages") Reported-by: syzbot Reported-and-tested-by: Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/strparser/strparser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index c5fda15ba319..1fdab5c4eda8 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp) * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ - if (sock_owned_by_user(strp->sk)) { + if (sock_owned_by_user_nocheck(strp->sk)) { queue_work(strp_wq, &strp->work); return; } -- cgit v1.2.3