From ee89bab14e857678f83a71ee99e575b0fdbb58d4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 9 Aug 2012 22:14:56 +0000 Subject: net: move and rename netif_notify_peers() I believe net/core/dev.c is a better place for netif_notify_peers(), because the other net event notification functions also live in that file. Also rename it to netdev_notify_peers(). Cc: David S. Miller Cc: Ian Campbell Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 511323e89cec..6c4d5fe53ce8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -324,24 +324,6 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); -/** - * netif_notify_peers - notify network peers about existence of @dev - * @dev: network device - * - * Generate traffic such that interested network peers are aware of - * @dev, such as by generating a gratuitous ARP. This may be used when - * a device wants to inform the rest of the network about some sort of - * reconfiguration such as a failover event or virtual machine - * migration. - */ -void netif_notify_peers(struct net_device *dev) -{ - rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - rtnl_unlock(); -} -EXPORT_SYMBOL(netif_notify_peers); - /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. -- cgit v1.2.3
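To illustrate the renamed helper in use (a hedged sketch, not part of the patch above; the driver function name is made up): a paravirtual NIC might announce itself after a failover or live migration so that peers relearn where its MAC address lives.

	static void example_nic_after_migration(struct net_device *netdev)
	{
		netif_tx_start_all_queues(netdev);
		/* Fires NETDEV_NOTIFY_PEERS; listeners typically emit a
		 * gratuitous ARP. netdev_notify_peers() takes the rtnl lock
		 * internally, so it must not be called with rtnl already held.
		 */
		netdev_notify_peers(netdev);
	}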
From af4c6641f5ad445fe6d0832da42406dbd9a37ce4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 25 May 2012 13:42:45 -0600 Subject: net sched: Pass the skb into change so it can access NETLINK_CB cls_flow.c plays with uids and gids. Unless I misread that code, it is possible for classifiers to depend on the specific uid and gid values. Therefore I need to know the user namespace of the netlink socket that is installing the packet classifiers. Pass in the rtnetlink skb so I can access the NETLINK_CB of the passed packet. In particular I want access to sk_user_ns(NETLINK_CB(in_skb).ssk). Pass not the user namespace but the incoming rtnetlink skb into the classifier change routines, as that is generally the more useful parameter. Cc: Jamal Hadi Salim Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- net/sched/cls_api.c | 2 +- net/sched/cls_basic.c | 3 ++- net/sched/cls_cgroup.c | 3 ++- net/sched/cls_flow.c | 3 ++- net/sched/cls_fw.c | 3 ++- net/sched/cls_route.c | 3 ++- net/sched/cls_rsvp.h | 3 ++- net/sched/cls_tcindex.c | 3 ++- net/sched/cls_u32.c | 3 ++- 9 files changed, 17 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 6dd1131f2ec1..dc3ef5aef355 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -319,7 +319,7 @@ replay: } } - err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + err = tp->ops->change(skb, tp, cl, t->tcm_handle, tca, &fh); if (err == 0) { if (tp_created) { spin_lock_bh(root_lock); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 590960a22a77..344a11b342e5 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -162,7 +162,8 @@ errout: return err; } -static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, +static int basic_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { int err; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7743ea8d1d38..91de66695b4a 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -151,7 +151,8 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; -static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, +static int cls_cgroup_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index ccd08c8dc6a7..ae854f3434b0 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -347,7 +347,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static int flow_change(struct tcf_proto *tp, unsigned long base, +static int flow_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8384a4797240..4075a0aef2aa 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -233,7 +233,8 @@ errout: return err; } -static int fw_change(struct tcf_proto *tp, unsigned long base, +static int fw_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 44f405cb9aaf..c10d57bf98f2 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -427,7 +427,8 @@ errout: return err; } -static int route4_change(struct tcf_proto *tp, unsigned long base, +static int route4_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 18ab93ec8d7e..494bbb90924a 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -416,7 +416,8 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, }; -static int rsvp_change(struct tcf_proto *tp, unsigned long base, +static int rsvp_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index
fe29420d0b0e..a1293b4ab7a1 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -332,7 +332,8 @@ errout: } static int -tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, +tcindex_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { struct nlattr *opt = tca[TCA_OPTIONS]; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d45373fb00b9..c7c27bc91b5a 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -544,7 +544,8 @@ errout: return err; } -static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, +static int u32_change(struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { -- cgit v1.2.3
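A sketch of what the widened signature enables (an illustrative classifier with assumed names; sk_user_ns() and NETLINK_CB() are the real interfaces named in the commit message): a ->change() implementation can now find out which user namespace issued the netlink request.

	static int example_change(struct sk_buff *in_skb,
				  struct tcf_proto *tp, unsigned long base,
				  u32 handle, struct nlattr **tca,
				  unsigned long *arg)
	{
		/* The netlink socket that sent the request rides in the skb's cb. */
		struct user_namespace *uns = sk_user_ns(NETLINK_CB(in_skb).ssk);

		if (uns != &init_user_ns)
			return -EOPNOTSUPP; /* e.g. refuse configs from other namespaces */

		/* ... normal option parsing and filter installation ... */
		return 0;
	}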
From a6c6796c7127de55cfa9bb0cfbb082ec0acd4eab Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 25 May 2012 13:49:36 -0600 Subject: userns: Convert cls_flow to work with user namespaces enabled The flow classifier can use the uids and gids of the sockets that are transmitting packets, and it inserts those uids and gids into the packet classification calculation. I don't fully understand the details, but it appears that we can depend on specific uids and gids when making traffic classification decisions. To work with user namespaces enabled, map from kuids and kgids into uids and gids in the initial user namespace, giving raw integer values the code can play with and depend on. To avoid issues of userspace depending on uids and gids in packet classifiers installed from other user namespaces and getting confused, deny all packet classifiers that use uids or gids that are not coming from a netlink socket in the initial user namespace. Cc: Patrick McHardy Cc: Eric Dumazet Cc: Jamal Hadi Salim Cc: Changli Gao Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- net/sched/cls_flow.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index ae854f3434b0..ce82d0cb1b47 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -193,15 +193,19 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb) static u32 flow_get_skuid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) - return skb->sk->sk_socket->file->f_cred->fsuid; + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); + } return 0; } static u32 flow_get_skgid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) - return skb->sk->sk_socket->file->f_cred->fsgid; + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); + } return 0; } @@ -387,6 +391,10 @@ static int flow_change(struct sk_buff *in_skb, if (fls(keymask) - 1 > FLOW_KEY_MAX) return -EOPNOTSUPP; + + if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && + sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) + return -EOPNOTSUPP; } err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); -- cgit v1.2.3
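For reference, a minimal sketch of the kuid-to-integer mapping the patch relies on (the helper function is hypothetical; from_kuid() and init_user_ns are the real kernel symbols used in the diff above):

	static u32 example_global_uid(const struct cred *cred)
	{
		kuid_t kuid = cred->fsuid;

		/* Interpret the id in the initial user namespace; a kuid with
		 * no mapping in the target namespace would come back as
		 * (uid_t)-1.
		 */
		return from_kuid(&init_user_ns, kuid);
	}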
From 16c0b164bd24d44db137693a36b428ba28970c62 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 15 Aug 2012 20:44:27 +0000 Subject: act_mirred: do not drop packets when mirroring fails We drop packets unconditionally when we fail to mirror them. This is not intended in some cases. Consider a kvm guest: we may mirror the traffic of the bridge to a tap device used by a VM. When the kernel fails to mirror the packet, in conditions such as qemu crashing or having stopped polling the tap, it's hard for the management software to detect such a condition and tear down the mirroring beforehand. This would lead to all packets to the bridge being dropped and would break the networking of the other virtual machines. To solve the issue, this patch does not drop packets when the kernel fails to mirror them; only redirected packets are dropped. Signed-off-by: Jason Wang Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fe81cc18e9e0..9c0fd0c78814 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -200,13 +200,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, out: if (err) { m->tcf_qstats.overlimits++; - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - retval = TC_ACT_SHOT; - } else { + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else retval = m->tcf_action; - } spin_unlock(&m->tcf_lock); return retval; -- cgit v1.2.3
From b379135c40163ae79ba7a54e6928b53983e74ee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Sep 2012 03:19:57 +0000 Subject: fq_codel: dont reinit flow state When fq_codel builds a new flow, it should not reset codel state. The codel algorithm needs the previous values (lastcount, drop_next) to behave properly. Signed-off-by: Dave Taht Signed-off-by: Eric Dumazet Acked-by: Dave Taht Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9fc1c62ec80e..4e606fcb2534 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -191,7 +191,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); - codel_vars_init(&flow->cvars); q->new_flow_count++; flow->deficit = q->quantum; flow->dropped = 0; @@ -418,6 +417,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) struct fq_codel_flow *flow = q->flows + i; INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); } } if (sch->limit >= 1) -- cgit v1.2.3
From 23d3b8bfb8eb20e7d96afa09991e6a5ed1c83164 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Sep 2012 01:02:56 +0000 Subject: net: qdisc busylock needs lockdep annotations It seems we need to provide the ability for stacked devices to use a specific lock_class_key for sch->busylock. We could instead default the l2tpeth tx_queue_len to 0 (no qdisc), but a user might use a qdisc anyway. (So the same fixes are probably needed on non-LLTX stacked drivers.) Noticed while stressing an L2TPV3 setup: ====================================================== [ INFO: possible circular locking dependency detected ] 3.6.0-rc3+ #788 Not tainted ------------------------------------------------------- netperf/4660 is trying to acquire lock: (l2tpsock){+.-...}, at: [] l2tp_xmit_skb+0x172/0xa50 [l2tp_core] but task is already holding lock: (&(&sch->busylock)->rlock){+.-...}, at: [] dev_queue_xmit+0xd75/0xe00 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&(&sch->busylock)->rlock){+.-...}: [] lock_acquire+0x90/0x200 [] _raw_spin_lock_irqsave+0x4c/0x60 [] __wake_up+0x32/0x70 [] tty_wakeup+0x3e/0x80 [] pty_write+0x73/0x80 [] tty_put_char+0x3c/0x40 [] process_echoes+0x142/0x330 [] n_tty_receive_buf+0x8fb/0x1230 [] flush_to_ldisc+0x142/0x1c0 [] process_one_work+0x198/0x760 [] worker_thread+0x186/0x4b0 [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 -> #0 (l2tpsock){+.-...}: [] __lock_acquire+0x1628/0x1b10 [] lock_acquire+0x90/0x200 [] _raw_spin_lock+0x41/0x50 [] l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [] l2tp_eth_dev_xmit+0x32/0x60 [l2tp_eth] [] dev_hard_start_xmit+0x502/0xa70 [] sch_direct_xmit+0xfe/0x290 [] dev_queue_xmit+0x1e5/0xe00 [] ip_finish_output+0x3d0/0x890 [] ip_output+0x59/0xf0 [] ip_local_out+0x2d/0xa0 [] ip_queue_xmit+0x1c3/0x680 [] tcp_transmit_skb+0x402/0xa60 [] tcp_write_xmit+0x1f4/0xa30 [] tcp_push_one+0x30/0x40 [] tcp_sendmsg+0xe82/0x1040 [] inet_sendmsg+0x125/0x230 [] sock_sendmsg+0xdc/0xf0 [] sys_sendto+0xfe/0x130 [] system_call_fastpath+0x16/0x1b Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&sch->busylock)->rlock); lock(l2tpsock); lock(&(&sch->busylock)->rlock); lock(l2tpsock); *** DEADLOCK *** 5 locks held by netperf/4660: #0: (sk_lock-AF_INET){+.+.+.}, at: [] tcp_sendmsg+0x2c/0x1040 #1: (rcu_read_lock){.+.+..}, at: [] ip_queue_xmit+0x0/0x680 #2: (rcu_read_lock_bh){.+....}, at: [] ip_finish_output+0x135/0x890 #3: (rcu_read_lock_bh){.+....}, at: [] dev_queue_xmit+0x0/0xe00 #4: (&(&sch->busylock)->rlock){+.-...}, at: [] dev_queue_xmit+0xd75/0xe00 stack backtrace: Pid: 4660, comm: netperf Not tainted 3.6.0-rc3+ #788 Call Trace: [] print_circular_bug+0x1fb/0x20c [] __lock_acquire+0x1628/0x1b10 [] ? check_usage+0x9b/0x4d0 [] ?
__lock_acquire+0x2e4/0x1b10 [] lock_acquire+0x90/0x200 [] ? l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [] _raw_spin_lock+0x41/0x50 [] ? l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [] l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [] l2tp_eth_dev_xmit+0x32/0x60 [l2tp_eth] [] dev_hard_start_xmit+0x502/0xa70 [] ? dev_hard_start_xmit+0x5e/0xa70 [] ? dev_queue_xmit+0x141/0xe00 [] sch_direct_xmit+0xfe/0x290 [] dev_queue_xmit+0x1e5/0xe00 [] ? dev_hard_start_xmit+0xa70/0xa70 [] ip_finish_output+0x3d0/0x890 [] ? ip_finish_output+0x135/0x890 [] ip_output+0x59/0xf0 [] ip_local_out+0x2d/0xa0 [] ip_queue_xmit+0x1c3/0x680 [] ? ip_local_out+0xa0/0xa0 [] tcp_transmit_skb+0x402/0xa60 [] ? tcp_md5_do_lookup+0x18e/0x1a0 [] tcp_write_xmit+0x1f4/0xa30 [] tcp_push_one+0x30/0x40 [] tcp_sendmsg+0xe82/0x1040 [] inet_sendmsg+0x125/0x230 [] ? inet_create+0x6b0/0x6b0 [] ? sock_update_classid+0xc2/0x3b0 [] ? sock_update_classid+0x130/0x3b0 [] sock_sendmsg+0xdc/0xf0 [] ? fget_light+0x3f9/0x4f0 [] sys_sendto+0xfe/0x130 [] ? trace_hardirqs_on+0xd/0x10 [] ? _raw_spin_unlock_irq+0x30/0x50 [] ? finish_task_switch+0x83/0xf0 [] ? finish_task_switch+0x46/0xf0 [] ? sysret_check+0x1b/0x56 [] system_call_fastpath+0x16/0x1b Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6c4d5fe53ce8..aefc1504dc88 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -527,6 +527,8 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc_ops *ops) { @@ -534,6 +536,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc *sch; unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; + struct net_device *dev = dev_queue->dev; p = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); @@ -553,12 +556,16 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); + spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - dev_hold(qdisc_dev(sch)); + dev_hold(dev); atomic_set(&sch->refcnt, 1); return sch; -- cgit v1.2.3 From 15e473046cb6e5d18a4d0057e61d76315230382b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Sep 2012 20:12:54 +0000 Subject: netlink: Rename pid to portid to avoid confusion It is a frequent mistake to confuse the netlink port identifier with a process identifier. Try to reduce this confusion by renaming fields that hold port identifiers portid instead of pid. I have carefully avoided changing the structures exported to userspace to avoid changing the userspace API. I have successfully built an allyesconfig kernel with this change. Signed-off-by: "Eric W. Biederman" Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 52 ++++++++++++++++++++++++++-------------------------- net/sched/cls_api.c | 14 +++++++------- net/sched/sch_api.c | 44 ++++++++++++++++++++++---------------------- 3 files changed, 55 insertions(+), 55 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e3d2c78cb52c..102761d294cb 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -644,7 +644,7 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { struct tcamsg *t; @@ -652,7 +652,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*t), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_nlmsg_trim; t = nlmsg_data(nlh); @@ -678,7 +678,7 @@ out_nlmsg_trim: } static int -act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, +act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct tc_action *a, int event) { struct sk_buff *skb; @@ -686,16 +686,16 @@ act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, a, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnl_unicast(skb, net, pid); + return rtnl_unicast(skb, net, portid); } static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; @@ -762,7 +762,7 @@ static struct tc_action *create_a(int i) } static int tca_action_flush(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 pid) + struct nlmsghdr *n, u32 portid) { struct sk_buff *skb; unsigned char *b; @@ -799,7 +799,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (a->ops == NULL) goto err_out; - nlh = nlmsg_put(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); if (!nlh) goto out_module_put; t = nlmsg_data(nlh); @@ -823,7 +823,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; @@ -841,7 +841,7 @@ noflush_out: static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 pid, int event) + u32 portid, int event) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -853,13 +853,13 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1] != NULL) - return tca_action_flush(net, tb[1], n, pid); + return tca_action_flush(net, tb[1], n, portid); else return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, pid); + act = tcf_action_get_1(tb[i], n, portid); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; @@ -874,7 +874,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } if (event == RTM_GETACTION) - 
ret = act_get_notify(net, pid, n, head, event); + ret = act_get_notify(net, portid, n, head, event); else { /* delete */ struct sk_buff *skb; @@ -884,7 +884,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } - if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, + if (tca_get_fill(skb, head, portid, n->nlmsg_seq, 0, event, 0, 1) <= 0) { kfree_skb(skb); ret = -EINVAL; @@ -893,7 +893,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, /* now do the delete */ tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (ret > 0) return 0; @@ -905,7 +905,7 @@ err: } static int tcf_add_notify(struct net *net, struct tc_action *a, - u32 pid, u32 seq, int event, u16 flags) + u32 portid, u32 seq, int event, u16 flags) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -920,7 +920,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a, b = skb_tail_pointer(skb); - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*t), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_kfree_skb; t = nlmsg_data(nlh); @@ -940,7 +940,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a, nlh->nlmsg_len = skb_tail_pointer(skb) - b; NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); if (err > 0) err = 0; return err; @@ -953,7 +953,7 @@ out_kfree_skb: static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 pid, int ovr) + u32 portid, int ovr) { int ret = 0; struct tc_action *act; @@ -971,7 +971,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, /* dump then free all the actions after update; inserted policy * stays intact */ - ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); + ret = tcf_add_notify(net, act, portid, seq, RTM_NEWACTION, n->nlmsg_flags); for (a = act; a; a = act) { act = a->next; kfree(a); @@ -984,7 +984,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ACT_MAX + 1]; - u32 pid = skb ? NETLINK_CB(skb).pid : 0; + u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); @@ -1008,17 +1008,17 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - pid, RTM_DELACTION); + portid, RTM_DELACTION); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - pid, RTM_GETACTION); + portid, RTM_GETACTION); break; default: BUG(); @@ -1085,7 +1085,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) goto out_module_put; } - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; @@ -1109,7 +1109,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_cancel(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; - if (NETLINK_CB(cb->skb).pid && ret) + if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index dc3ef5aef355..7ae02892437c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -343,13 +343,13 @@ errout: } static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, - unsigned long fh, u32 pid, u32 seq, u16 flags, int event) + unsigned long fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); @@ -381,18 +381,18 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, unsigned long fh, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } @@ -407,7 +407,7 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, { struct tcf_dump_args *a = (void *)arg; - return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } @@ -465,7 +465,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a08b4ab3e421..a18d975db59c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1185,7 +1185,7 @@ graft: } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1193,7 +1193,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, struct gnet_dump d; struct qdisc_size_table *stab; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); @@ -1248,25 +1248,25 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct Qdisc *old, struct Qdisc *new) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old)) { - if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, + if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new)) { - if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, + if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; } if (skb->len) - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: @@ -1289,7 +1289,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, q_idx++; } else { if (!tc_qdisc_dump_ignore(q) && - tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; q_idx++; @@ -1300,7 +1300,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, continue; } if (!tc_qdisc_dump_ignore(q) && - tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; q_idx++; @@ -1375,7 +1375,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; - u32 pid = tcm->tcm_parent; + u32 portid = tcm->tcm_parent; u32 clid = tcm->tcm_handle; u32 qid = TC_H_MAJ(clid); int err; @@ -1403,8 +1403,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Step 1. Determine qdisc handle X:0 */ - if (pid != TC_H_ROOT) { - u32 qid1 = TC_H_MAJ(pid); + if (portid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ @@ -1418,10 +1418,10 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now qid is genuine qdisc handle consistent * both with parent and child. * - * TC_H_MAJ(pid) still may be unspecified, complete it now. + * TC_H_MAJ(portid) still may be unspecified, complete it now. */ - if (pid) - pid = TC_H_MAKE(qid, pid); + if (portid) + portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) qid = dev->qdisc->handle; @@ -1439,7 +1439,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now try to get class */ if (clid == 0) { - if (pid == TC_H_ROOT) + if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); @@ -1478,7 +1478,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) new_cl = cl; err = -EOPNOTSUPP; if (cops->change) - err = cops->change(q, clid, pid, tca, &new_cl); + err = cops->change(q, clid, portid, tca, &new_cl); if (err == 0) tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); @@ -1492,7 +1492,7 @@ out: static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1500,7 +1500,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, struct gnet_dump d; const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); @@ -1540,18 +1540,18 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, unsigned long cl, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } @@ -1565,7 +1565,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walk { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; - return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); } -- cgit v1.2.3
From bdfc87f7d1e253e0a61e2fc6a75ea9d76f7fc03a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Sep 2012 13:11:12 +0000 Subject: net-sched: sch_cbq: avoid infinite loop It's possible to set up a bad cbq configuration leading to an infinite loop in cbq_classify(): DEV_OUT=eth0 ICMP="match ip protocol 1 0xff" U32="protocol ip u32" DST="match ip dst" tc qdisc add dev $DEV_OUT root handle 1: cbq avpkt 1000 \ bandwidth 100mbit tc class add dev $DEV_OUT parent 1: classid 1:1 cbq \ rate 512kbit allot 1500 prio 5 bounded isolated tc filter add dev $DEV_OUT parent 1: prio 3 $U32 \ $ICMP $DST 192.168.3.234 flowid 1: Reported-by: Denys Fedoryschenko Tested-by: Denys Fedoryschenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6aabd77d1cfd..564b9fc8efd3 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: -- cgit v1.2.3
From e29fe837bfa3ca0a9f4ef1d4a90e6e0a74b6b8a0 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:32 +0000 Subject: net_sched: gred: correct comment about qavg calculation in RIO mode Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e901583e4ea5..fca73cdf44d9 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -176,7 +176,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; -- cgit v1.2.3
From c22e464022f935b0cbd8724b1d99d800d49518a9 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:33 +0000 Subject: net_sched: gred: eliminate redundant DP prio comparisons Each pair of DPs only needs to be compared once when searching for a non-unique prio value. Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- net/sched/sch_gred.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index fca73cdf44d9..e19d4ebfea1c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -102,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } -- cgit v1.2.3
From 1fe37b106b039d9358fd1211c39b1fa199e547a8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:34 +0000 Subject: net_sched: gred: fix qave reporting via netlink q->vars.qavg is a Wlog scaled value, but q->backlog is not. In order to pass q->vars.qavg as the backlog value, we need to un-scale it. Additionally, the qave value returned via netlink should not be Wlog scaled, so we need to un-scale the result of red_calc_qavg(). This caused artificially high values for "Average Queue" to be shown by 'tc -s -d qdisc', but did not affect the actual operation of GRED. Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e19d4ebfea1c..b2570b59d85e 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -534,6 +534,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; + unsigned long qavg; memset(&opt, 0, sizeof(opt)); @@ -565,7 +566,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (gred_wred_mode(table)) gred_load_wred_set(table, q); - opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); + qavg = red_calc_qavg(&q->parms, &q->vars, + q->vars.qavg >> q->parms.Wlog); + opt.qave = qavg >> q->parms.Wlog; append_opt: if (nla_append(skb, sizeof(opt), &opt) < 0) -- cgit v1.2.3
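To make the fixed-point arithmetic explicit (a toy sketch, not kernel code): RED keeps the average queue length scaled by 2^Wlog, so the stored value must be shifted down before it can be compared with, or reported as, a plain byte count. With Wlog = 5, for example, a stored average of 131072 un-scales to 4096 bytes.

	static unsigned long example_unscale(unsigned long qavg_scaled, u8 Wlog)
	{
		/* q->vars.qavg is kept scaled by 2^Wlog; shift right for bytes. */
		return qavg_scaled >> Wlog;
	}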
From ba1bf474eae07a128fae6252bf7d68eaef0eacf8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:35 +0000 Subject: net_sched: gred: actually perform idling in WRED mode gred_dequeue() and gred_drop() do not seem to get called when the queue is empty, meaning that we never start idling while in WRED mode. And since qidlestart is not stored by gred_store_wred_set(), we would never stop idling while in WRED mode if we ever started. This messes up the average queue size calculation that influences packet marking/dropping behavior. Now, we start WRED-mode idling as we are removing the last packet from the queue. Also, we now actually stop WRED-mode idling when enqueuing a packet. Cc: Bruce Osler Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b2570b59d85e..d42234c0f13b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -136,6 +136,7 @@ static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -259,16 +260,18 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) } else { q->backlog -= qdisc_pkt_len(skb); - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } return skb; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return NULL; } @@ -290,19 +293,20 @@ static unsigned int gred_drop(struct Qdisc *sch) q->backlog -= len; q->stats.other++; - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } qdisc_drop(skb, sch); return len; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return 0; - } static void gred_reset(struct Qdisc *sch) -- cgit v1.2.3
From 8a8e04df4747661daaee77e98e102d99c9e09b98 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:07 +0200 Subject: cgroup: Assign subsystem IDs during compile time WARNING: With this change it is impossible to load externally built controllers anymore. In the case where CONFIG_NETPRIO_CGROUP=m and CONFIG_NET_CLS_CGROUP=m are set, the corresponding subsys_id should also be a constant. Up to now, net_prio_subsys_id and net_cls_subsys_id were of type int and their values were assigned at runtime. By switching the macro definition IS_SUBSYS_ENABLED from IS_BUILTIN to IS_ENABLED, all *_subsys_id will have constant values. That means we need to remove all the code which assumes a value can be assigned to net_prio_subsys_id and net_cls_subsys_id. A close look is necessary at the RCU part, which was introduced by the following patch: commit f845172531fb7410c7fb7780b1a6e51ee6df7d52 Author: Herbert Xu Mon May 24 09:12:34 2010 Committer: David S. Miller Mon May 24 09:12:34 2010 cls_cgroup: Store classid in struct sock This code was added to init_cgroup_cls(): /* We can't use rcu_assign_pointer because this is an int. */ smp_wmb(); net_cls_subsys_id = net_cls_subsys.subsys_id; respectively to exit_cgroup_cls(): net_cls_subsys_id = -1; synchronize_rcu(); and in the module version of task_cls_classid(): rcu_read_lock(); id = rcu_dereference(net_cls_subsys_id); if (id >= 0) classid = container_of(task_subsys_state(p, id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); No explicit explanation of why the RCU part is needed was given. (The rcu_dereference() was fixed by exchanging it for rcu_dereference_index_check() in a later commit, but that is a minor detail.) So here is my pondering on why it was introduced and why it is safe to remove it now. Note that this code was copied over to net_prio; the reasoning holds for that subsystem too.
The idea behind the RCU use for net_cls_subsys_id is to make sure we get a valid pointer back from task_subsys_state(). task_subsys_state() just blindly accesses the subsys array and returns the pointer. Obviously, passing -1 as the id into task_subsys_state() returns an invalid value (out of lower bound). So this code makes sure that the id is assigned only after the module is loaded and the subsystem registered. Before unregistering the module, all old readers must have left the critical section. This is done by assigning -1 to the id and issuing a synchronize_rcu(). Any new readers won't call task_subsys_state() anymore, and therefore it is safe to unregister the subsystem. The new code relies on the same trick, but it looks at the subsys pointer returned by task_subsys_state() (remember, the id is constant and therefore we always have a valid index into the subsys array). No precautions need to be taken during module loading. Eventually, all CPUs will get a valid pointer back from task_subsys_state(), because rebind_subsystems(), which is called after the module's init() function, will assign subsys[net_cls_subsys_id] the newly loaded module's subsystem pointer. When the subsystem is about to be removed, rebind_subsystems() will be called before the module's exit() function. In this case, rebind_subsystems() will assign subsys[net_cls_subsys_id] a NULL pointer and then call synchronize_rcu(). By then, all old readers have left the critical section. Any new reader won't access the subsystem anymore. At this point we are safe to unregister the subsystem. No synchronize_rcu() call is needed. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: "David S. Miller" Cc: "Paul E. McKenney" Cc: Andrew Morton Cc: Eric Dumazet Cc: Gao feng Cc: Glauber Costa Cc: Herbert Xu Cc: Jamal Hadi Salim Cc: John Fastabend Cc: Kamezawa Hiroyuki Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- net/sched/cls_cgroup.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7743ea8d1d38..67cf90d962f4 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -77,9 +77,7 @@ struct cgroup_subsys net_cls_subsys = { .name = "net_cls", .create = cgrp_create, .destroy = cgrp_destroy, -#ifdef CONFIG_NET_CLS_CGROUP .subsys_id = net_cls_subsys_id, -#endif .base_cftypes = ss_files, .module = THIS_MODULE, }; @@ -283,12 +281,6 @@ static int __init init_cgroup_cls(void) if (ret) goto out; -#ifndef CONFIG_NET_CLS_CGROUP - /* We can't use rcu_assign_pointer because this is an int. */ - smp_wmb(); - net_cls_subsys_id = net_cls_subsys.subsys_id; -#endif - ret = register_tcf_proto_ops(&cls_cgroup_ops); if (ret) cgroup_unload_subsys(&net_cls_subsys); @@ -301,11 +293,6 @@ static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); -#ifndef CONFIG_NET_CLS_CGROUP - net_cls_subsys_id = -1; - synchronize_rcu(); -#endif - cgroup_unload_subsys(&net_cls_subsys); } -- cgit v1.2.3
From 8c7f6edbda01f1b1a2e60ad61f14fe38023e433b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Sep 2012 12:20:58 -0700 Subject: cgroup: mark subsystems with broken hierarchy support and whine if cgroups are nested for them Currently, cgroup hierarchy support is a mess. cpu-related subsystems behave correctly - configuration, accounting and control on a parent properly cover its children.
blkio and freezer completely ignore hierarchy and treat all cgroups as if they're directly under the root cgroup. Others show yet different behaviors. These differing interpretations of cgroup hierarchy make using cgroup confusing and make it impossible to co-mount controllers into the same hierarchy and obtain sane behavior. Eventually, we want full hierarchy support from all subsystems and probably a unified hierarchy. Having users on separate hierarchies expect completely different behaviors depending on the mounted subsystem is detrimental to making any progress on this front. This patch adds cgroup_subsys.broken_hierarchy and sets it to %true for controllers which are lacking in hierarchy support. The goal of this patch is two-fold. * Move users away from using hierarchy on currently non-hierarchical subsystems, so that implementing proper hierarchy support on those doesn't surprise them. * Keep track of which controllers are broken, and how, and nudge the subsystems to implement proper hierarchy support. For now, start with a single warning message. We can whine louder later on. v2: Fixed a typo spotted by Michal. Warning message updated. v3: Updated memcg part so that it doesn't generate a warning in the cases where .use_hierarchy=false doesn't make the behavior different from root.use_hierarchy=true. Fixed a typo spotted by Glauber. v4: Check ->broken_hierarchy after cgroup creation is complete so that ->create() can affect the result per Michal. Dropped unnecessary memcg root handling per Michal. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Acked-by: Serge E. Hallyn Cc: Glauber Costa Cc: Peter Zijlstra Cc: Paul Turner Cc: Johannes Weiner Cc: Thomas Graf Cc: Vivek Goyal Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Neil Horman Cc: Aneesh Kumar K.V --- net/sched/cls_cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7743ea8d1d38..907daf99ab2e 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -82,6 +82,15 @@ struct cgroup_subsys net_cls_subsys = { #endif .base_cftypes = ss_files, .module = THIS_MODULE, + + /* + * While net_cls cgroup has the rudimentary hierarchy support of + * inheriting the parent's classid on cgroup creation, it doesn't + * properly propagates config changes in ancestors to their + * descendents. A child should follow the parent's configuration + * but be allowed to override it. Fix it and remove the following. + */ + .broken_hierarchy = true, }; struct cls_cgroup_head { -- cgit v1.2.3
From 71261956973ba9e0637848a5adb4a5819b4bae83 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 15 Sep 2012 00:41:35 +0000 Subject: pkt_sched: fix virtual-start-time update in QFQ If the old timestamps of a class, say cl, are stale when the class becomes active, then QFQ may assign to cl a much higher start time than the maximum value allowed. This may happen when QFQ assigns to the start time of cl the finish time of a group whose classes are characterized by a higher value of the ratio max_class_pkt/weight_of_the_class with respect to that of cl. Inserting a class with too high a start time into the bucket list corrupts the data structure and may eventually lead to crashes. This patch limits the maximum start time assigned to a class. Signed-off-by: Paolo Valente Signed-off-by: David S.
Miller --- net/sched/sch_qfq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e4723d31fdd5..211a21217045 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -865,7 +865,10 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; + if (qfq_gt(limit, next->F)) + cl->S = next->F; + else /* preserve timestamp correctness */ + cl->S = limit; return; } } -- cgit v1.2.3
From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per-socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. It's done to increase the probability of coalescing small write()s into single segments in skbs still in the write queue (not yet sent). But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page. It's also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more than wanted. This patch adds a per-task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure (up to 32768 bytes per frag, that's order-3 pages on x86). This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. It's possible some SG-enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub-fragments, since some arches have PAGE_SIZE=65536. Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4). Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- net/sched/em_meta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4ab6e3325573..7c3de6ffa516 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -461,7 +461,7 @@ META_COLLECTOR(int_sk_sndtimeo) META_COLLECTOR(int_sk_sendmsg_off) { SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_sndmsg_off; + dst->value = skb->sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) -- cgit v1.2.3
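A condensed sketch of the usage pattern this allocator encourages (simplified from the description above; sk_page_frag() and sk_page_frag_refill() are the interfaces this patch introduces, while the surrounding function is illustrative): callers refill the per-task page_frag and carve sequential chunks out of it.

	static int example_append(struct sock *sk, const char *data, int len)
	{
		struct page_frag *pfrag = sk_page_frag(sk); /* task frag for TCP */
		int copy;

		/* Refill allocates up to an order-3 page and automatically
		 * falls back to smaller orders under memory pressure.
		 */
		if (!sk_page_frag_refill(sk, pfrag))
			return -ENOMEM;

		copy = min_t(int, len, pfrag->size - pfrag->offset);
		memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
		pfrag->offset += copy;
		return copy;
	}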
From f54ba7798848ce1385a71b36a2c997422c82220a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Sep 2012 18:35:47 -0400 Subject: pkt_sched: Fix warning false positives. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC refuses to recognize that all error control flows do in fact set err to something. Add an explicit initialization to shut it up. net/sched/sch_drr.c: In function ‘drr_enqueue’: net/sched/sch_drr.c:359:11: warning: ‘err’ may be used uninitialized in this function [-Wmaybe-uninitialized] net/sched/sch_qfq.c: In function ‘qfq_enqueue’: net/sched/sch_qfq.c:885:11: warning: ‘err’ may be used uninitialized in this function [-Wmaybe-uninitialized] Signed-off-by: David S. Miller --- net/sched/sch_drr.c | 2 +- net/sched/sch_qfq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 9ce0b4fe23ff..71e50c80315f 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -352,7 +352,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - int err; + int err = 0; cl = drr_classify(skb, sch, &err); if (cl == NULL) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e4723d31fdd5..25566208f12b 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -878,7 +878,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - int err; + int err = 0; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { -- cgit v1.2.3