From 90b2f49a502fa71090d9f4fe29a2f51fe5dff76d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:37 +0000 Subject: openvswitch: use RCU protection in ovs_vport_cmd_fill_info() ovs_vport_cmd_fill_info() can be called without RTNL or RCU. Use RCU protection and dev_net_rcu() to avoid potential UAF. Fixes: 9354d4520342 ("openvswitch: reliable interface indentification in port dumps") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 225f6048867f..5d548eda742d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2101,6 +2101,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -2117,12 +2118,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -2143,6 +2147,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: -- cgit v1.2.3 From a1e64addf3ff9257b45b78bc7d743781c3f41340 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Sat, 8 Mar 2025 01:45:59 +0100 Subject: net: openvswitch: remove misbehaving actions length check The actions length check is unreliable and produces different results depending on the initial length of the provided netlink attribute and the composition of the actual actions inside of it. For example, a user can add 4088 empty clone() actions without triggering -EMSGSIZE, on attempt to add 4089 such actions the operation will fail with the -EMSGSIZE verdict. However, if another 16 KB of other actions will be *appended* to the previous 4089 clone() actions, the check passes and the flow is successfully installed into the openvswitch datapath. The reason for a such a weird behavior is the way memory is allocated. When ovs_flow_cmd_new() is invoked, it calls ovs_nla_copy_actions(), that in turn calls nla_alloc_flow_actions() with either the actual length of the user-provided actions or the MAX_ACTIONS_BUFSIZE. The function adds the size of the sw_flow_actions structure and then the actually allocated memory is rounded up to the closest power of two. So, if the user-provided actions are larger than MAX_ACTIONS_BUFSIZE, then MAX_ACTIONS_BUFSIZE + sizeof(*sfa) rounded up is 32K + 24 -> 64K. Later, while copying individual actions, we look at ksize(), which is 64K, so this way the MAX_ACTIONS_BUFSIZE check is not actually triggered and the user can easily allocate almost 64 KB of actions. However, when the initial size is less than MAX_ACTIONS_BUFSIZE, but the actions contain ones that require size increase while copying (such as clone() or sample()), then the limit check will be performed during the reserve_sfa_size() and the user will not be allowed to create actions that yield more than 32 KB internally. This is one part of the problem. The other part is that it's not actually possible for the userspace application to know beforehand if the particular set of actions will be rejected or not. Certain actions require more space in the internal representation, e.g. an empty clone() takes 4 bytes in the action list passed in by the user, but it takes 12 bytes in the internal representation due to an extra nested attribute, and some actions require less space in the internal representations, e.g. set(tunnel(..)) normally takes 64+ bytes in the action list provided by the user, but only needs to store a single pointer in the internal implementation, since all the data is stored in the tunnel_info structure instead. And the action size limit is applied to the internal representation, not to the action list passed by the user. So, it's not possible for the userpsace application to predict if the certain combination of actions will be rejected or not, because it is not possible for it to calculate how much space these actions will take in the internal representation without knowing kernel internals. All that is causing random failures in ovs-vswitchd in userspace and inability to handle certain traffic patterns as a result. For example, it is reported that adding a bit more than a 1100 VMs in an OpenStack setup breaks the network due to OVS not being able to handle ARP traffic anymore in some cases (it tries to install a proper datapath flow, but the kernel rejects it with -EMSGSIZE, even though the action list isn't actually that large.) Kernel behavior must be consistent and predictable in order for the userspace application to use it in a reasonable way. ovs-vswitchd has a mechanism to re-direct parts of the traffic and partially handle it in userspace if the required action list is oversized, but that doesn't work properly if we can't actually tell if the action list is oversized or not. Solution for this is to check the size of the user-provided actions instead of the internal representation. This commit just removes the check from the internal part because there is already an implicit size check imposed by the netlink protocol. The attribute can't be larger than 64 KB. Realistically, we could reduce the limit to 32 KB, but we'll be risking to break some existing setups that rely on the fact that it's possible to create nearly 64 KB action lists today. Vast majority of flows in real setups are below 100-ish bytes. So removal of the limit will not change real memory consumption on the system. The absolutely worst case scenario is if someone adds a flow with 64 KB of empty clone() actions. That will yield a 192 KB in the internal representation consuming 256 KB block of memory. However, that list of actions is not meaningful and also a no-op. Real world very large action lists (that can occur for a rare cases of BUM traffic handling) are unlikely to contain a large number of clones and will likely have a lot of tunnel attributes making the internal representation comparable in size to the original action list. So, it should be fine to just remove the limit. Commit in the 'Fixes' tag is the first one that introduced the difference between internal representation and the user-provided action lists, but there were many more afterwards that lead to the situation we have today. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Signed-off-by: Ilya Maximets Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250308004609.2881861-1-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/openvswitch/flow_netlink.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 881ddd3696d5..95e0dd14dc1a 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2317,14 +2317,10 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) OVS_FLOW_ATTR_MASK, true, skb); } -#define MAX_ACTIONS_BUFSIZE (32 * 1024) - static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); - sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL); if (!sfa) return ERR_PTR(-ENOMEM); @@ -2480,15 +2476,6 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size exceeds max %u", - MAX_ACTIONS_BUFSIZE); - return ERR_PTR(-EMSGSIZE); - } - new_acts_size = MAX_ACTIONS_BUFSIZE; - } - acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return ERR_CAST(acts); @@ -3545,7 +3532,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, int err; u32 mpls_label_count = 0; - *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); + *sfa = nla_alloc_flow_actions(nla_len(attr)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); -- cgit v1.2.3 From 1063ae07383c0ddc5bcce170260c143825846b03 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 8 Mar 2025 13:05:43 -0500 Subject: Revert "openvswitch: switch to per-action label counting in conntrack" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, ovs_ct_set_labels() is only called for confirmed conntrack entries (ct) within ovs_ct_commit(). However, if the conntrack entry does not have the labels_ext extension, attempting to allocate it in ovs_ct_get_conn_labels() for a confirmed entry triggers a warning in nf_ct_ext_add(): WARN_ON(nf_ct_is_confirmed(ct)); This happens when the conntrack entry is created externally before OVS increments net->ct.labels_used. The issue has become more likely since commit fcb1aa5163b1 ("openvswitch: switch to per-action label counting in conntrack"), which changed to use per-action label counting and increment net->ct.labels_used when a flow with ct action is added. Since there’s no straightforward way to fully resolve this issue at the moment, this reverts the commit to avoid breaking existing use cases. Fixes: fcb1aa5163b1 ("openvswitch: switch to per-action label counting in conntrack") Reported-by: Jianbo Liu Signed-off-by: Xin Long Acked-by: Aaron Conole Link: https://patch.msgid.link/1bdeb2f3a812bca016a225d3de714427b2cd4772.1741457143.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/openvswitch/conntrack.c | 30 ++++++++++++++++++------------ net/openvswitch/datapath.h | 3 +++ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3bb4810234aa..e573e9221302 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1368,8 +1368,11 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) attr == OVS_KEY_ATTR_CT_MARK) return true; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - attr == OVS_KEY_ATTR_CT_LABELS) - return true; + attr == OVS_KEY_ATTR_CT_LABELS) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } return false; } @@ -1378,7 +1381,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { - unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_conntrack_info ct_info; const char *helper = NULL; u16 family; @@ -1407,12 +1409,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, return -ENOMEM; } - if (nf_connlabels_get(net, n_bits - 1)) { - nf_ct_tmpl_free(ct_info.ct); - OVS_NLERR(log, "Failed to set connlabel length"); - return -EOPNOTSUPP; - } - if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) @@ -1581,7 +1577,6 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) if (ct_info->ct) { if (ct_info->timeout[0]) nf_ct_destroy_timeout(ct_info->ct); - nf_connlabels_put(nf_ct_net(ct_info->ct)); nf_ct_tmpl_free(ct_info->ct); } } @@ -2006,9 +2001,17 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = { int ovs_ct_init(struct net *net) { -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + if (nf_connlabels_get(net, n_bits - 1)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) return ovs_ct_limit_init(net, ovs_net); #else return 0; @@ -2017,9 +2020,12 @@ int ovs_ct_init(struct net *net) void ovs_ct_exit(struct net *net) { -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_net *ovs_net = net_generic(net, ovs_net_id); +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) ovs_ct_limit_exit(net, ovs_net); #endif + + if (ovs_net->xt_label) + nf_connlabels_put(net); } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 365b9bb7f546..9ca6231ea647 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -160,6 +160,9 @@ struct ovs_net { #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif + + /* Module reference for configuring conntrack. */ + bool xt_label; }; /** -- cgit v1.2.3