From 63c82997f5c0f3e1b914af43d82f712a86bc5f3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Nov 2018 21:06:26 -0800 Subject: net: sched: cls_flower: validate nested enc_opts_policy to avoid warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCA_FLOWER_KEY_ENC_OPTS and TCA_FLOWER_KEY_ENC_OPTS_MASK can only currently contain further nested attributes, which are parsed by hand, so the policy is never actually used resulting in a W=1 build warning: net/sched/cls_flower.c:492:1: warning: ‘enc_opts_policy’ defined but not used [-Wunused-const-variable=] enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { Add the validation anyway to avoid potential bugs when other attributes are added and to make the attribute structure slightly more clear. Validation will also set extact to point to bad attribute on error. Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Jakub Kicinski Acked-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9aada2d0ef06..c6c327874abc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -709,11 +709,23 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct netlink_ext_ack *extack) { const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; - int option_len, key_depth, msk_depth = 0; + int err, option_len, key_depth, msk_depth = 0; + + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); } -- cgit v1.2.3 From 7236ead1b14923f3ba35cd29cce13246be83f451 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 10 Nov 2018 16:22:29 -0800 Subject: act_mirred: clear skb->tstamp on redirect If sch_fq is used at ingress, skbs that might have been timestamped by net_timestamp_set() if a packet capture is requesting timestamps could be delayed by arbitrary amount of time, since sch_fq time base is MONOTONIC. Fix this problem by moving code from sch_netem.c to act_mirred.c. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 3 ++- net/sched/sch_netem.c | 9 --------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1dae5f2b358f..c8cf4d10c435 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -258,7 +258,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; - + if (skb2->tc_from_ingress) + skb2->tstamp = 0; /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 57b3ad9394ad..2c38e3d07924 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -648,15 +648,6 @@ deliver: */ skb->dev = qdisc_dev(sch); -#ifdef CONFIG_NET_CLS_ACT - /* - * If it's at ingress let's pretend the delay is - * from the network (tstamp will be updated). - */ - if (skb->tc_redirected && skb->tc_from_ingress) - skb->tstamp = 0; -#endif - if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); -- cgit v1.2.3 From 08e14fe429a07475ee9f29a283945d602e4a6d92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 12 Nov 2018 16:17:16 -0800 Subject: net_sched: sch_fq: ensure maxrate fq parameter applies to EDT flows When EDT conversion happened, fq lost the ability to enfore a maxrate for all flows. It kept it for non EDT flows. This commit restores the functionality. Tested: tc qd replace dev eth0 root fq maxrate 500Mbit netperf -P0 -H host -- -O THROUGHPUT 489.75 Fixes: ab408b6dc744 ("tcp: switch tcp and sch_fq to new earliest departure time model") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4b1af706896c..25a7cf6d380f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -469,22 +469,29 @@ begin: goto begin; } prefetch(&skb->end); - f->credit -= qdisc_pkt_len(skb); + plen = qdisc_pkt_len(skb); + f->credit -= plen; - if (ktime_to_ns(skb->tstamp) || !q->rate_enable) + if (!q->rate_enable) goto out; rate = q->flow_max_rate; - if (skb->sk) - rate = min(skb->sk->sk_pacing_rate, rate); - - if (rate <= q->low_rate_threshold) { - f->credit = 0; - plen = qdisc_pkt_len(skb); - } else { - plen = max(qdisc_pkt_len(skb), q->quantum); - if (f->credit > 0) - goto out; + + /* If EDT time was provided for this skb, we need to + * update f->time_next_packet only if this qdisc enforces + * a flow max rate. + */ + if (!skb->tstamp) { + if (skb->sk) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate <= q->low_rate_threshold) { + f->credit = 0; + } else { + plen = max(plen, q->quantum); + if (f->credit > 0) + goto out; + } } if (rate != ~0UL) { u64 len = (u64)plen * NSEC_PER_SEC; -- cgit v1.2.3 From 19ab69107d3ecfb7cd3e38ad262a881be40c01a3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 14 Nov 2018 12:17:25 +0100 Subject: net/sched: act_pedit: fix memory leak when IDR allocation fails tcf_idr_check_alloc() can return a negative value, on allocation failures (-ENOMEM) or IDR exhaustion (-ENOSPC): don't leak keys_ex in these cases. Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index da3dd0f68cc2..2b372a06b432 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -201,7 +201,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_release; } } else { - return err; + ret = err; + goto out_free; } p = to_pedit(*a); -- cgit v1.2.3 From f2cbd485282014132851bf37cb2ca624a456275d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 20 Nov 2018 22:18:44 +0100 Subject: net/sched: act_police: fix race condition on state variables after 'police' configuration parameters were converted to use RCU instead of spinlock, the state variables used to compute the traffic rate (namely 'tcfp_toks', 'tcfp_ptoks' and 'tcfp_t_c') are erroneously read/updated in the traffic path without any protection. Use a dedicated spinlock to avoid race conditions on these variables, and ensure proper cache-line alignment. In this way, 'police' is still faster than what we observed when 'tcf_lock' was used in the traffic path _ i.e. reverting commit 2d550dbad83c ("net/sched: act_police: don't use spinlock in the data path"). Moreover, we preserve the throughput improvement that was obtained after 'police' started using per-cpu counters, when 'avrate' is used instead of 'rate'. Changes since v1 (thanks to Eric Dumazet): - call ktime_get_ns() before acquiring the lock in the traffic path - use a dedicated spinlock instead of tcf_lock - improve cache-line usage Fixes: 2d550dbad83c ("net/sched: act_police: don't use spinlock in the data path") Reported-and-suggested-by: Eric Dumazet Signed-off-by: Davide Caratti Reviewed-by: Eric Dumazet --- net/sched/act_police.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 052855d47354..ee4665a5a022 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -27,10 +27,7 @@ struct tcf_police_params { u32 tcfp_ewma_rate; s64 tcfp_burst; u32 tcfp_mtu; - s64 tcfp_toks; - s64 tcfp_ptoks; s64 tcfp_mtu_ptoks; - s64 tcfp_t_c; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; @@ -41,6 +38,11 @@ struct tcf_police_params { struct tcf_police { struct tc_action common; struct tcf_police_params __rcu *params; + + spinlock_t tcfp_lock ____cacheline_aligned_in_smp; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_t_c; }; #define to_police(pc) ((struct tcf_police *)pc) @@ -186,12 +188,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); - new->tcfp_toks = new->tcfp_burst; - if (new->peak_present) { + if (new->peak_present) new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, new->tcfp_mtu); - new->tcfp_ptoks = new->tcfp_mtu_ptoks; - } if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); @@ -207,7 +206,12 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } spin_lock_bh(&police->tcf_lock); - new->tcfp_t_c = ktime_get_ns(); + spin_lock_bh(&police->tcfp_lock); + police->tcfp_t_c = ktime_get_ns(); + police->tcfp_toks = new->tcfp_burst; + if (new->peak_present) + police->tcfp_ptoks = new->tcfp_mtu_ptoks; + spin_unlock_bh(&police->tcfp_lock); police->tcf_action = parm->action; rcu_swap_protected(police->params, new, @@ -257,25 +261,28 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } now = ktime_get_ns(); - toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst); + spin_lock_bh(&police->tcfp_lock); + toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst); if (p->peak_present) { - ptoks = toks + p->tcfp_ptoks; + ptoks = toks + police->tcfp_ptoks; if (ptoks > p->tcfp_mtu_ptoks) ptoks = p->tcfp_mtu_ptoks; ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += p->tcfp_toks; + toks += police->tcfp_toks; if (toks > p->tcfp_burst) toks = p->tcfp_burst; toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { - p->tcfp_t_c = now; - p->tcfp_toks = toks; - p->tcfp_ptoks = ptoks; + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; } + spin_unlock_bh(&police->tcfp_lock); } inc_overlimits: -- cgit v1.2.3 From 484afd1bd3fc6f9f5347289fc8b285aa65f67054 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 21 Nov 2018 18:23:53 +0100 Subject: net/sched: act_police: add missing spinlock initialization commit f2cbd4852820 ("net/sched: act_police: fix race condition on state variables") introduces a new spinlock, but forgets its initialization. Ensure that tcf_police_init() initializes 'tcfp_lock' every time a 'police' action is newly created, to avoid the following lockdep splat: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. <...> Call Trace: dump_stack+0x85/0xcb register_lock_class+0x581/0x590 __lock_acquire+0xd4/0x1330 ? tcf_police_init+0x2fa/0x650 [act_police] ? lock_acquire+0x9e/0x1a0 lock_acquire+0x9e/0x1a0 ? tcf_police_init+0x2fa/0x650 [act_police] ? tcf_police_init+0x55a/0x650 [act_police] _raw_spin_lock_bh+0x34/0x40 ? tcf_police_init+0x2fa/0x650 [act_police] tcf_police_init+0x2fa/0x650 [act_police] tcf_action_init_1+0x384/0x4c0 tcf_action_init+0xf6/0x160 tcf_action_add+0x73/0x170 tc_ctl_action+0x122/0x160 rtnetlink_rcv_msg+0x2a4/0x490 ? netlink_deliver_tap+0x99/0x400 ? validate_linkmsg+0x370/0x370 netlink_rcv_skb+0x4d/0x130 netlink_unicast+0x196/0x230 netlink_sendmsg+0x2e5/0x3e0 sock_sendmsg+0x36/0x40 ___sys_sendmsg+0x280/0x2f0 ? _raw_spin_unlock+0x24/0x30 ? handle_pte_fault+0xafe/0xf30 ? find_held_lock+0x2d/0x90 ? syscall_trace_enter+0x1df/0x360 ? __sys_sendmsg+0x5e/0xa0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x60/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f1841c7cf10 Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ae cc 00 00 48 89 04 24 RSP: 002b:00007ffcf9df4d68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f1841c7cf10 RDX: 0000000000000000 RSI: 00007ffcf9df4dc0 RDI: 0000000000000003 RBP: 000000005bf56105 R08: 0000000000000002 R09: 00007ffcf9df8edc R10: 00007ffcf9df47e0 R11: 0000000000000246 R12: 0000000000671be0 R13: 00007ffcf9df4e84 R14: 0000000000000008 R15: 0000000000000000 Fixes: f2cbd4852820 ("net/sched: act_police: fix race condition on state variables") Reported-by: Cong Wang Signed-off-by: Davide Caratti Acked-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_police.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sched') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ee4665a5a022..37c9b8f0e10f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -124,6 +124,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; + spin_lock_init(&(to_police(*a)->tcfp_lock)); } else if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; -- cgit v1.2.3