summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/bridge/br_device.c2
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_multicast.c44
-rw-r--r--net/bridge/br_netfilter.c2
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/bridge/netfilter/ebt_ulog.c9
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c41
-rw-r--r--net/core/flow_dissector.c2
-rw-r--r--net/core/netpoll.c31
-rw-r--r--net/core/skbuff.c14
-rw-r--r--net/ipv4/netfilter/arp_tables.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c110
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c7
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_bic.c5
-rw-r--r--net/ipv4/tcp_cong.c47
-rw-r--r--net/ipv4/tcp_cubic.c5
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_highspeed.c4
-rw-r--r--net/ipv4/tcp_htcp.c4
-rw-r--r--net/ipv4/tcp_hybla.c5
-rw-r--r--net/ipv4/tcp_illinois.c5
-rw-r--r--net/ipv4/tcp_input.c40
-rw-r--r--net/ipv4/tcp_lp.c5
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_scalable.c5
-rw-r--r--net/ipv4/tcp_vegas.c11
-rw-r--r--net/ipv4/tcp_veno.c9
-rw-r--r--net/ipv4/tcp_yeah.c5
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv6/netfilter/ip6_tables.c5
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c7
-rw-r--r--net/ipv6/route.c9
-rw-r--r--net/ipv6/xfrm6_policy.c8
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h11
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c2
-rw-r--r--net/netfilter/ipset/ip_set_core.c70
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h21
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c22
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c22
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c48
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c39
-rw-r--r--net/netfilter/nf_conntrack_acct.c12
-rw-r--r--net/netfilter/nf_conntrack_core.c16
-rw-r--r--net/netfilter/nf_conntrack_netlink.c51
-rw-r--r--net/netfilter/nft_compat.c8
-rw-r--r--net/netfilter/nft_nat.c12
-rw-r--r--net/netfilter/x_tables.c7
-rw-r--r--net/netfilter/xt_NFQUEUE.c7
-rw-r--r--net/netfilter/xt_connbytes.c6
-rw-r--r--net/netfilter/xt_socket.c13
-rw-r--r--net/openvswitch/Makefile2
-rw-r--r--net/openvswitch/datapath.c668
-rw-r--r--net/openvswitch/datapath.h9
-rw-r--r--net/openvswitch/dp_notify.c7
-rw-r--r--net/openvswitch/flow.c1605
-rw-r--r--net/openvswitch/flow.h132
-rw-r--r--net/openvswitch/flow_netlink.c1630
-rw-r--r--net/openvswitch/flow_netlink.h60
-rw-r--r--net/openvswitch/flow_table.c592
-rw-r--r--net/openvswitch/flow_table.h81
-rw-r--r--net/openvswitch/vport-gre.c2
-rw-r--r--net/openvswitch/vport-internal_dev.c2
-rw-r--r--net/openvswitch/vport-netdev.c16
-rw-r--r--net/openvswitch/vport-netdev.h1
-rw-r--r--net/openvswitch/vport-vxlan.c1
-rw-r--r--net/sched/sch_fq.c1
-rw-r--r--net/sctp/ipv6.c4
-rw-r--r--net/sctp/sm_sideeffect.c1
-rw-r--r--net/x25/Kconfig4
-rw-r--r--net/xfrm/xfrm_ipcomp.c12
78 files changed, 3051 insertions, 2657 deletions
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ca04163635da..e6b7fecb3af1 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -64,7 +64,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_flood_deliver(br, skb, false);
goto out;
}
- if (br_multicast_rcv(br, NULL, skb)) {
+ if (br_multicast_rcv(br, NULL, skb, vid)) {
kfree_skb(skb);
goto out;
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index a2fd37ec35f7..7e73c32e205d 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
- br_multicast_rcv(br, p, skb))
+ br_multicast_rcv(br, p, skb, vid))
goto drop;
if (p->state == BR_STATE_LEARNING)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 0513ef3ce667..4c214b2b88ef 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -947,7 +947,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
@@ -957,12 +958,10 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
int type;
int err = 0;
__be32 group;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*ih)))
return -EINVAL;
- br_vlan_get_tag(skb, &vid);
ih = igmpv3_report_hdr(skb);
num = ntohs(ih->ngrec);
len = sizeof(*ih);
@@ -1005,7 +1004,8 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
@@ -1013,12 +1013,10 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
int len;
int num;
int err = 0;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*icmp6h)))
return -EINVAL;
- br_vlan_get_tag(skb, &vid);
icmp6h = icmp6_hdr(skb);
num = ntohs(icmp6h->icmp6_dataun.un_data16[1]);
len = sizeof(*icmp6h);
@@ -1141,7 +1139,8 @@ static void br_multicast_query_received(struct net_bridge *br,
static int br_ip4_multicast_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
const struct iphdr *iph = ip_hdr(skb);
struct igmphdr *ih = igmp_hdr(skb);
@@ -1153,7 +1152,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
unsigned long now = jiffies;
__be32 group;
int err = 0;
- u16 vid = 0;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
@@ -1189,7 +1187,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
if (!group)
goto out;
- br_vlan_get_tag(skb, &vid);
mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid);
if (!mp)
goto out;
@@ -1219,7 +1216,8 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct mld_msg *mld;
@@ -1231,7 +1229,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
unsigned long now = jiffies;
const struct in6_addr *group = NULL;
int err = 0;
- u16 vid = 0;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
@@ -1265,7 +1262,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (!group)
goto out;
- br_vlan_get_tag(skb, &vid);
mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid);
if (!mp)
goto out;
@@ -1439,7 +1435,8 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct sk_buff *skb2 = skb;
const struct iphdr *iph;
@@ -1447,7 +1444,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
unsigned int len;
unsigned int offset;
int err;
- u16 vid = 0;
/* We treat OOM as packet loss for now. */
if (!pskb_may_pull(skb, sizeof(*iph)))
@@ -1508,7 +1504,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = 0;
- br_vlan_get_tag(skb2, &vid);
BR_INPUT_SKB_CB(skb)->igmp = 1;
ih = igmp_hdr(skb2);
@@ -1519,10 +1514,10 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_add_group(br, port, ih->group, vid);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
- err = br_ip4_multicast_igmp3_report(br, port, skb2);
+ err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
break;
case IGMP_HOST_MEMBERSHIP_QUERY:
- err = br_ip4_multicast_query(br, port, skb2);
+ err = br_ip4_multicast_query(br, port, skb2, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
br_ip4_multicast_leave_group(br, port, ih->group, vid);
@@ -1540,7 +1535,8 @@ err_out:
#if IS_ENABLED(CONFIG_IPV6)
static int br_multicast_ipv6_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct sk_buff *skb2;
const struct ipv6hdr *ip6h;
@@ -1550,7 +1546,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
unsigned int len;
int offset;
int err;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*ip6h)))
return -EINVAL;
@@ -1640,7 +1635,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
err = 0;
- br_vlan_get_tag(skb, &vid);
BR_INPUT_SKB_CB(skb)->igmp = 1;
switch (icmp6_type) {
@@ -1657,10 +1651,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
break;
}
case ICMPV6_MLD2_REPORT:
- err = br_ip6_multicast_mld2_report(br, port, skb2);
+ err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
break;
case ICMPV6_MGM_QUERY:
- err = br_ip6_multicast_query(br, port, skb2);
+ err = br_ip6_multicast_query(br, port, skb2, vid);
break;
case ICMPV6_MGM_REDUCTION:
{
@@ -1681,7 +1675,7 @@ out:
#endif
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb, u16 vid)
{
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
@@ -1691,10 +1685,10 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
switch (skb->protocol) {
case htons(ETH_P_IP):
- return br_multicast_ipv4_rcv(br, port, skb);
+ return br_multicast_ipv4_rcv(br, port, skb, vid);
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_multicast_ipv6_rcv(br, port, skb);
+ return br_multicast_ipv6_rcv(br, port, skb, vid);
#endif
}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 878f008afefa..80cad2cf02a7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -559,6 +559,8 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
else if (skb->protocol == htons(ETH_P_PPP_SES))
nf_bridge->mask |= BRNF_PPPoE;
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
return skb->dev;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d1ca6d956633..229d820bdf0b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -435,7 +435,7 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
extern unsigned int br_mdb_rehash_seq;
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
- struct sk_buff *skb);
+ struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct sk_buff *skb, u16 vid);
void br_multicast_add_port(struct net_bridge_port *port);
@@ -504,7 +504,8 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
return 0;
}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68f8128147be..5ca74a0e595f 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -3,6 +3,7 @@
#
#
config NF_TABLES_BRIDGE
+ depends on NF_TABLES
tristate "Ethernet Bridge nf_tables support"
menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 518093802d1d..7c470c371e14 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -181,6 +181,7 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
ub->qlen++;
pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* Fill in the ulog data */
pm->version = EBT_ULOG_VERSION;
@@ -193,8 +194,6 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
pm->hook = hooknr;
if (uloginfo->prefix != NULL)
strcpy(pm->prefix, uloginfo->prefix);
- else
- *(pm->prefix) = '\0';
if (in) {
strcpy(pm->physindev, in->name);
@@ -204,16 +203,14 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
else
strcpy(pm->indev, in->name);
- } else
- pm->indev[0] = pm->physindev[0] = '\0';
+ }
if (out) {
/* If out exists, then out is a bridge port */
strcpy(pm->physoutdev, out->name);
/* rcu_read_lock()ed by nf_hook_slow */
strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
- } else
- pm->outdev[0] = pm->physoutdev[0] = '\0';
+ }
if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0)
BUG();
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index e8cb016fa34d..cf54b22818c8 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -47,14 +48,50 @@ static struct pernet_operations nf_tables_bridge_net_ops = {
.exit = nf_tables_bridge_exit_net,
};
+static unsigned int
+nft_do_chain_bridge(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, ops, skb, in, out);
+
+ return nft_do_chain_pktinfo(&pkt, ops);
+}
+
+static struct nf_chain_type filter_bridge = {
+ .family = NFPROTO_BRIDGE,
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .hook_mask = (1 << NF_BR_LOCAL_IN) |
+ (1 << NF_BR_FORWARD) |
+ (1 << NF_BR_LOCAL_OUT),
+ .fn = {
+ [NF_BR_LOCAL_IN] = nft_do_chain_bridge,
+ [NF_BR_FORWARD] = nft_do_chain_bridge,
+ [NF_BR_LOCAL_OUT] = nft_do_chain_bridge,
+ },
+};
+
static int __init nf_tables_bridge_init(void)
{
- return register_pernet_subsys(&nf_tables_bridge_net_ops);
+ int ret;
+
+ nft_register_chain_type(&filter_bridge);
+ ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_bridge);
+
+ return ret;
}
static void __exit nf_tables_bridge_exit(void)
{
- return unregister_pernet_subsys(&nf_tables_bridge_net_ops);
+ unregister_pernet_subsys(&nf_tables_bridge_net_ops);
+ nft_unregister_chain_type(&filter_bridge);
}
module_init(nf_tables_bridge_init);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5cac36e6ccd1..0242035192f1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -66,7 +66,7 @@ again:
struct iphdr _iph;
ip:
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (!iph)
+ if (!iph || iph->ihl < 5)
return false;
if (ip_is_fragment(iph))
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index fc75c9e461b8..8f971990677c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -636,8 +636,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo
netpoll_send_skb(np, send_skb);
- /* If there are several rx_hooks for the same address,
- we're fine by sending a single reply */
+ /* If there are several rx_skb_hooks for the same
+ * address we're fine by sending a single reply
+ */
break;
}
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
@@ -719,8 +720,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo
netpoll_send_skb(np, send_skb);
- /* If there are several rx_hooks for the same address,
- we're fine by sending a single reply */
+ /* If there are several rx_skb_hooks for the same
+ * address, we're fine by sending a single reply
+ */
break;
}
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
@@ -756,11 +758,12 @@ static bool pkt_is_ns(struct sk_buff *skb)
int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
{
- int proto, len, ulen;
- int hits = 0;
+ int proto, len, ulen, data_len;
+ int hits = 0, offset;
const struct iphdr *iph;
struct udphdr *uh;
struct netpoll *np, *tmp;
+ uint16_t source;
if (list_empty(&npinfo->rx_np))
goto out;
@@ -820,7 +823,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
len -= iph->ihl*4;
uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
+ offset = (unsigned char *)(uh + 1) - skb->data;
ulen = ntohs(uh->len);
+ data_len = skb->len - offset;
+ source = ntohs(uh->source);
if (ulen != len)
goto out;
@@ -834,9 +840,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (np->local_port && np->local_port != ntohs(uh->dest))
continue;
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
+ np->rx_skb_hook(np, source, skb, offset, data_len);
hits++;
}
} else {
@@ -859,7 +863,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
uh = udp_hdr(skb);
+ offset = (unsigned char *)(uh + 1) - skb->data;
ulen = ntohs(uh->len);
+ data_len = skb->len - offset;
+ source = ntohs(uh->source);
if (ulen != skb->len)
goto out;
if (udp6_csum_init(skb, uh, IPPROTO_UDP))
@@ -872,9 +879,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (np->local_port && np->local_port != ntohs(uh->dest))
continue;
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
+ np->rx_skb_hook(np, source, skb, offset, data_len);
hits++;
}
#endif
@@ -1062,7 +1067,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
npinfo->netpoll = np;
- if (np->rx_hook) {
+ if (np->rx_skb_hook) {
spin_lock_irqsave(&npinfo->rx_lock, flags);
npinfo->rx_flags |= NETPOLL_RX_ENABLED;
list_add_tail(&np->rx, &npinfo->rx_np);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 31aab5376cb4..3735fad5616e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -476,6 +476,18 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
}
EXPORT_SYMBOL(skb_add_rx_frag);
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
+{
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
+
static void skb_drop_list(struct sk_buff **listp)
{
kfree_skb_list(*listp);
@@ -2003,7 +2015,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
int len, __wsum csum)
{
const struct skb_checksum_ops ops = {
- .update = csum_partial,
+ .update = csum_partial_ext,
.combine = csum_block_add_ext,
};
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85a4f21aac1a..59da7cde0724 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d23118d95ff9..718dfbd30cbe 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb,
addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a2e2b61cd7da..2510c02c2d21 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
@@ -57,15 +58,21 @@ struct clusterip_config {
struct rcu_head rcu;
};
-static LIST_HEAD(clusterip_configs);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
+ struct proc_dir_entry *procdir;
#endif
+};
static inline void
clusterip_config_get(struct clusterip_config *c)
@@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
list_del_rcu(&c->list);
- spin_unlock(&clusterip_lock);
+ spin_unlock(&cn->lock);
local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
@@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)
}
static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- list_for_each_entry_rcu(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
- c = __clusterip_config_find(clusterip);
+ c = __clusterip_config_find(net, clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
@@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
struct net_device *dev)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
@@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
- clusterip_procdir,
+ cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
kfree(c);
@@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- spin_lock_bh(&clusterip_lock);
- list_add_rcu(&c->list, &clusterip_configs);
- spin_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
return c;
}
@@ -370,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
@@ -384,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- dev = dev_get_by_name(&init_net, e->ip.iniface);
+ dev = dev_get_by_name(par->net, e->ip.iniface);
if (!dev) {
pr_info("no such interface %s\n",
e->ip.iniface);
@@ -492,6 +504,7 @@ arp_mangle(const struct nf_hook_ops *ops,
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
+ struct net *net = dev_net(in ? in : out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -508,7 +521,7 @@ arp_mangle(const struct nf_hook_ops *ops,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip, 0);
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -698,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
- ret = xt_register_target(&clusterip_tg_reg);
+ ret = register_pernet_subsys(&clusterip_net_ops);
if (ret < 0)
return ret;
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
ret = nf_register_hook(&cip_arp_ops);
if (ret < 0)
goto cleanup_target;
-#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
- if (!clusterip_procdir) {
- pr_err("Unable to proc dir entry\n");
- ret = -ENOMEM;
- goto cleanup_hook;
- }
-#endif /* CONFIG_PROC_FS */
-
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
+
return 0;
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
- nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
cleanup_target:
xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
- proc_remove(clusterip_procdir);
-#endif
+
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
rcu_barrier_bh();
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index cbc22158af49..9cb993cd224b 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net,
ub->qlen++;
pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net,
}
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
- else
- *(pm->prefix) = '\0';
if (in && in->hard_header_len > 0 &&
skb->mac_header != skb->network_header &&
@@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net,
if (in)
strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
- else
- pm->indev_name[0] = '\0';
if (out)
strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
- else
- pm->outdev_name[0] = '\0';
/* copy_len <= skb->len, so can't fail. */
if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 8f7536be1322..0f4cbfeb19bd 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -16,7 +16,6 @@
#include <net/netfilter/nf_tables.h>
#include <net/net_namespace.h>
#include <net/ip.h>
-#include <net/net_namespace.h>
#include <net/netfilter/nf_tables_ipv4.h>
static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d5b1390eebbe..3d69ec8dac57 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -701,13 +701,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
- .procname = "tcp_max_ssthresh",
- .data = &sysctl_tcp_max_ssthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c242440..821846fb0a7e 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,7 +140,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -149,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 019c2389a341..ad37bf18ae4b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -15,8 +15,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_max_ssthresh = 0;
-
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -299,35 +297,24 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-/*
- * Slow start is used when congestion window is less than slow start
- * threshold. This version implements the basic RFC2581 version
- * and optionally supports:
- * RFC3742 Limited Slow Start - growth limited to max_ssthresh
- * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- int cnt; /* increase in packets */
- unsigned int delta = 0;
- u32 snd_cwnd = tp->snd_cwnd;
-
- if (unlikely(!snd_cwnd)) {
- pr_err_once("snd_cwnd is nul, please report this bug.\n");
- snd_cwnd = 1U;
- }
+ u32 cwnd = tp->snd_cwnd + acked;
- if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
- cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
- else
- cnt = snd_cwnd; /* exponential increase */
-
- tp->snd_cwnd_cnt += cnt;
- while (tp->snd_cwnd_cnt >= snd_cwnd) {
- tp->snd_cwnd_cnt -= snd_cwnd;
- delta++;
- }
- tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
@@ -351,7 +338,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -360,7 +347,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
/* In dangerous area, increase slowly. */
else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b6ae92a51f58..828e4c3ffbaf 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,7 +304,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -315,7 +316,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 766032b4a6c3..f195d9316e55 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -8,7 +8,7 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly;
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 30f27f6b3655..8ed9305dfdf4 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
@@ -118,7 +118,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
*
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index c1a8175361e8..4a194acfd923 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,7 +227,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4d..478fe82611bf 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -85,7 +85,8 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -102,7 +103,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c871..8a520996f3d2 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -256,7 +256,8 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -270,7 +271,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In slow start */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
u32 delta;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b935397c703c..c53b7f35c51d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2903,7 +2903,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
if (seq_rtt < 0)
@@ -2918,20 +2919,25 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
{
struct tcp_sock *tp = tcp_sk(sk);
s32 seq_rtt = -1;
- if (tp->lsndtime && !tp->total_retrans)
- seq_rtt = tcp_time_stamp - tp->lsndtime;
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt = tcp_time_stamp - synack_stamp;
+
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3028,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
+ bool rtt_update;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3104,14 +3111,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
- (flag & FLAG_ACKED))
- tcp_rearm_rto(sk);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3150,6 +3156,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
+ } else if (skb && rtt_update && sack_rtt >= 0 &&
+ sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
}
#if FASTRETRANS_DEBUG > 0
@@ -3441,7 +3454,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight);
+ tcp_cong_avoid(sk, ack, acked, prior_in_flight);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -5626,6 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct request_sock *req;
int queued = 0;
bool acceptable;
+ u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5708,9 +5722,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
+ synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
@@ -5733,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, req);
+ tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 72f7218b03f5..991d62a2f9bb 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,12 +115,13 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
}
/**
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a7a5583eab04..a2b68a108eae 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -18,6 +18,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
@@ -104,13 +105,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
- /* {tcp|sock}_wfree() use exact truesize accounting :
- * sum(skb->truesize) MUST be exactly be gso_skb->truesize
- * So we account mss bytes of 'true size' for each segment.
- * The last segment will contain the remaining.
- */
- skb->truesize = mss;
- gso_skb->truesize -= mss;
+ sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
@@ -127,7 +122,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
- swap(gso_skb->truesize, skb->truesize);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8ce55b8aaec8..19ea6c2951f3 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,7 +15,8 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -23,7 +24,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 80fa2bfd7ede..06cae62bf208 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,13 +163,14 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
@@ -194,7 +195,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
} else {
u32 rtt, diff;
u64 target_cwnd;
@@ -243,7 +244,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
@@ -283,7 +284,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
}
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ac43cd747bce..326475a94865 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,13 +114,14 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
@@ -133,7 +134,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
} else {
u64 target_cwnd;
u32 rtt;
@@ -152,7 +153,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
if (veno->diff < beta) {
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 05c3b6f0e8e1..a347a078ee07 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,7 +69,8 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
@@ -78,7 +79,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
/* Scalable */
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index ccde54248c8c..e1a63930a967 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -104,10 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
const struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
- fl4->flowi4_oif = skb_dst(skb)->dev->ifindex;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
@@ -236,7 +240,7 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 44400c216dc6..710238f58aa9 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -349,6 +349,11 @@ ip6t_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
cpu = smp_processor_id();
table_base = private->entries[cpu];
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 56eef30ee5f6..da00a2ecde55 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,7 +39,7 @@ MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
MODULE_LICENSE("GPL");
/* Send RST reply */
-static void send_reset(struct net *net, struct sk_buff *oldskb)
+static void send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
struct tcphdr otcph, *tcph;
@@ -88,8 +88,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
}
/* Check checksum. */
- if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
- skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
pr_debug("TCP checksum is invalid\n");
return;
}
@@ -227,7 +226,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
/* Do nothing */
break;
case IP6T_TCP_RESET:
- send_reset(net, skb);
+ send_reset(net, skb, par->hooknum);
break;
default:
net_info_ratelimited("case %u not handled yet\n", reject->with);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1ac0b6e17d95..fd399ac6c1f7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1087,10 +1087,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev)))
return NULL;
- if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
- return dst;
+ if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ return NULL;
- return NULL;
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return dst;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 08ed2772b7aa..5f8e128c512d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -135,10 +135,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
struct ipv6_opt_hdr *exthdr;
const unsigned char *nh = skb_network_header(skb);
u8 nexthdr = nh[IP6CB(skb)->nhoff];
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
- fl6->flowi6_oif = skb_dst(skb)->dev->ifindex;
+ fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
@@ -285,7 +289,7 @@ static struct dst_ops xfrm6_dst_ops = {
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index a13e15be7911..f2c7d83dc23f 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -198,13 +198,14 @@ mtype_list(const struct ip_set *set,
struct mtype *map = set->data;
struct nlattr *adt, *nested;
void *x;
- u32 id, first = cb->args[2];
+ u32 id, first = cb->args[IPSET_CB_ARG0];
adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!adt)
return -EMSGSIZE;
- for (; cb->args[2] < map->elements; cb->args[2]++) {
- id = cb->args[2];
+ for (; cb->args[IPSET_CB_ARG0] < map->elements;
+ cb->args[IPSET_CB_ARG0]++) {
+ id = cb->args[IPSET_CB_ARG0];
x = get_ext(set, map, id);
if (!test_bit(id, map->members) ||
(SET_WITH_TIMEOUT(set) &&
@@ -231,14 +232,14 @@ mtype_list(const struct ip_set *set,
ipset_nest_end(skb, adt);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(id == first)) {
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, adt);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index e7603c5b53d7..cf99676e69f8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -254,7 +254,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -ENOMEM;
map->elements = last_port - first_port + 1;
- map->memsize = map->elements * sizeof(unsigned long);
+ map->memsize = bitmap_bytes(0, map->elements);
set->variant = &bitmap_port;
set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_port(set, map, first_port, last_port)) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index dc9284bdd2dd..bac7e01df67f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,10 +1182,12 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
static int
ip_set_dump_done(struct netlink_callback *cb)
{
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
- if (cb->args[2]) {
- pr_debug("release set %s\n", nfnl_set(inst, cb->args[1])->name);
- __ip_set_put_byindex(inst, (ip_set_id_t) cb->args[1]);
+ struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ if (cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n",
+ nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name);
+ __ip_set_put_byindex(inst,
+ (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
}
return 0;
}
@@ -1203,7 +1205,7 @@ dump_attrs(struct nlmsghdr *nlh)
}
static int
-dump_init(struct netlink_callback *cb)
+dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
@@ -1211,15 +1213,15 @@ dump_init(struct netlink_callback *cb)
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
/* Second pass, so parser can't fail */
nla_parse(cda, IPSET_ATTR_CMD_MAX,
attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
- /* cb->args[0] : dump single set/all sets
- * [1] : set index
- * [..]: type specific
+ /* cb->args[IPSET_CB_NET]: net namespace
+ * [IPSET_CB_DUMP]: dump single set/all sets
+ * [IPSET_CB_INDEX]: set index
+ * [IPSET_CB_ARG0]: type specific
*/
if (cda[IPSET_ATTR_SETNAME]) {
@@ -1231,7 +1233,7 @@ dump_init(struct netlink_callback *cb)
return -ENOENT;
dump_type = DUMP_ONE;
- cb->args[1] = index;
+ cb->args[IPSET_CB_INDEX] = index;
} else
dump_type = DUMP_ALL;
@@ -1239,7 +1241,8 @@ dump_init(struct netlink_callback *cb)
u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
dump_type |= (f << 16);
}
- cb->args[0] = dump_type;
+ cb->args[IPSET_CB_NET] = (unsigned long)inst;
+ cb->args[IPSET_CB_DUMP] = dump_type;
return 0;
}
@@ -1251,12 +1254,12 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
struct ip_set *set = NULL;
struct nlmsghdr *nlh = NULL;
unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type, dump_flags;
int ret = 0;
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
- if (!cb->args[0]) {
- ret = dump_init(cb);
+ if (!cb->args[IPSET_CB_DUMP]) {
+ ret = dump_init(cb, inst);
if (ret < 0) {
nlh = nlmsg_hdr(cb->skb);
/* We have to create and send the error message
@@ -1267,17 +1270,18 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
}
}
- if (cb->args[1] >= inst->ip_set_max)
+ if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
goto out;
- dump_type = DUMP_TYPE(cb->args[0]);
- dump_flags = DUMP_FLAGS(cb->args[0]);
- max = dump_type == DUMP_ONE ? cb->args[1] + 1 : inst->ip_set_max;
+ dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]);
+ dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]);
+ max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1
+ : inst->ip_set_max;
dump_last:
- pr_debug("args[0]: %u %u args[1]: %ld\n",
- dump_type, dump_flags, cb->args[1]);
- for (; cb->args[1] < max; cb->args[1]++) {
- index = (ip_set_id_t) cb->args[1];
+ pr_debug("dump type, flag: %u %u index: %ld\n",
+ dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
+ for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
+ index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
set = nfnl_set(inst, index);
if (set == NULL) {
if (dump_type == DUMP_ONE) {
@@ -1294,7 +1298,7 @@ dump_last:
!!(set->type->features & IPSET_DUMP_LAST)))
continue;
pr_debug("List set: %s\n", set->name);
- if (!cb->args[2]) {
+ if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n");
__ip_set_get(set);
@@ -1311,7 +1315,7 @@ dump_last:
goto nla_put_failure;
if (dump_flags & IPSET_FLAG_LIST_SETNAME)
goto next_set;
- switch (cb->args[2]) {
+ switch (cb->args[IPSET_CB_ARG0]) {
case 0:
/* Core header data */
if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
@@ -1331,7 +1335,7 @@ dump_last:
read_lock_bh(&set->lock);
ret = set->variant->list(set, skb, cb);
read_unlock_bh(&set->lock);
- if (!cb->args[2])
+ if (!cb->args[IPSET_CB_ARG0])
/* Set is done, proceed with next one */
goto next_set;
goto release_refcount;
@@ -1340,8 +1344,8 @@ dump_last:
/* If we dump all sets, continue with dumping last ones */
if (dump_type == DUMP_ALL) {
dump_type = DUMP_LAST;
- cb->args[0] = dump_type | (dump_flags << 16);
- cb->args[1] = 0;
+ cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
+ cb->args[IPSET_CB_INDEX] = 0;
goto dump_last;
}
goto out;
@@ -1350,15 +1354,15 @@ nla_put_failure:
ret = -EFAULT;
next_set:
if (dump_type == DUMP_ONE)
- cb->args[1] = IPSET_INVALID_ID;
+ cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;
else
- cb->args[1]++;
+ cb->args[IPSET_CB_INDEX]++;
release_refcount:
/* If there was an error or set is done, release set */
- if (ret || !cb->args[2]) {
+ if (ret || !cb->args[IPSET_CB_ARG0]) {
pr_debug("release set %s\n", nfnl_set(inst, index)->name);
__ip_set_put_byindex(inst, index);
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
}
out:
if (nlh) {
@@ -1375,8 +1379,6 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
-
if (unlikely(protocol_failed(attr)))
return -IPSET_ERR_PROTOCOL;
@@ -1384,7 +1386,6 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
struct netlink_dump_control c = {
.dump = ip_set_dump_start,
.done = ip_set_dump_done,
- .data = (void *)inst
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
@@ -1961,7 +1962,6 @@ static int __net_init
ip_set_net_init(struct net *net)
{
struct ip_set_net *inst = ip_set_pernet(net);
-
struct ip_set **list;
inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6a80dbd30df7..be6932ad3a86 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -234,7 +234,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
#define mtype MTYPE
-#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
#define mtype_add IPSET_TOKEN(MTYPE, _add)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
@@ -931,7 +930,7 @@ mtype_list(const struct ip_set *set,
struct nlattr *atd, *nested;
const struct hbucket *n;
const struct mtype_elem *e;
- u32 first = cb->args[2];
+ u32 first = cb->args[IPSET_CB_ARG0];
/* We assume that one hash bucket fills into one page */
void *incomplete;
int i;
@@ -940,20 +939,22 @@ mtype_list(const struct ip_set *set,
if (!atd)
return -EMSGSIZE;
pr_debug("list hash set %s\n", set->name);
- for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
+ for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
+ cb->args[IPSET_CB_ARG0]++) {
incomplete = skb_tail_pointer(skb);
- n = hbucket(t, cb->args[2]);
- pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n);
+ n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ pr_debug("cb->arg bucket: %lu, t %p n %p\n",
+ cb->args[IPSET_CB_ARG0], t, n);
for (i = 0; i < n->pos; i++) {
e = ahash_data(n, i, set->dsize);
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
pr_debug("list hash %lu hbucket %p i %u, data %p\n",
- cb->args[2], n, i, e);
+ cb->args[IPSET_CB_ARG0], n, i, e);
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
- if (cb->args[2] == first) {
+ if (cb->args[IPSET_CB_ARG0] == first) {
nla_nest_cancel(skb, atd);
return -EMSGSIZE;
} else
@@ -968,16 +969,16 @@ mtype_list(const struct ip_set *set,
}
ipset_nest_end(skb, atd);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nlmsg_trim(skb, incomplete);
- if (unlikely(first == cb->args[2])) {
+ if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
pr_warning("Can't list set %s: one bucket does not fit into "
"a message. Please report it!\n", set->name);
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, atd);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 426032706ca9..2bc2dec20b00 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -137,12 +137,11 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet4_elem e = {
- .cidr[0] = h->nets[0].cidr[0] ? h->nets[0].cidr[0] : HOST_MASK,
- .cidr[1] = h->nets[0].cidr[1] ? h->nets[0].cidr[1] : HOST_MASK,
- };
+ struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -160,14 +159,14 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet4_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
u8 cidr, cidr2;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
@@ -364,12 +363,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet6_elem e = {
- .cidr[0] = h->nets[0].cidr[0] ? h->nets[0].cidr[0] : HOST_MASK,
- .cidr[1] = h->nets[0].cidr[1] ? h->nets[0].cidr[1] : HOST_MASK
- };
+ struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
@@ -386,11 +384,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet6_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 363fab933d48..703d1192a6a2 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -147,12 +147,11 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet4_elem e = {
- .cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
- .cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK),
- };
+ struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -174,8 +173,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet4_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
@@ -183,6 +181,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr, cidr2;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -419,12 +418,11 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet6_elem e = {
- .cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
- .cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK),
- };
+ struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
@@ -446,13 +444,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet6_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 port, port_to;
bool with_ports = false;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index ec6f6d15dded..3e2317f3cf68 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -490,14 +490,15 @@ list_set_list(const struct ip_set *set,
{
const struct list_set *map = set->data;
struct nlattr *atd, *nested;
- u32 i, first = cb->args[2];
+ u32 i, first = cb->args[IPSET_CB_ARG0];
const struct set_elem *e;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
- for (; cb->args[2] < map->size; cb->args[2]++) {
- i = cb->args[2];
+ for (; cb->args[IPSET_CB_ARG0] < map->size;
+ cb->args[IPSET_CB_ARG0]++) {
+ i = cb->args[IPSET_CB_ARG0];
e = list_set_elem(set, map, i);
if (e->id == IPSET_INVALID_ID)
goto finish;
@@ -522,13 +523,13 @@ list_set_list(const struct ip_set *set,
finish:
ipset_nest_end(skb, atd);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, atd);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a3df9bddc4f7..62786a495cea 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -704,7 +704,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
__ip_vs_dst_cache_reset(dest);
__ip_vs_svc_put(svc, false);
free_percpu(dest->stats.cpustats);
- kfree(dest);
+ ip_vs_dest_put_and_free(dest);
}
/*
@@ -3820,10 +3820,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- /* Some dest can be in grace period even before cleanup, we have to
- * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
- */
- rcu_barrier();
ip_vs_trash_cleanup(net);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
ip_vs_control_net_cleanup_sysctl(net);
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index eff13c94498e..ca056a331e60 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -136,7 +136,7 @@ static void ip_vs_lblc_rcu_free(struct rcu_head *head)
struct ip_vs_lblc_entry,
rcu_head);
- ip_vs_dest_put(en->dest);
+ ip_vs_dest_put_and_free(en->dest);
kfree(en);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 0b8550089a2e..3f21a2f47de1 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -130,7 +130,7 @@ static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
struct ip_vs_dest_set_elem *e;
e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
- ip_vs_dest_put(e->dest);
+ ip_vs_dest_put_and_free(e->dest);
kfree(e);
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 23e596e438b3..2f7ea7564044 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -20,13 +20,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
sctp_sctphdr_t *sh, _sctph;
sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (sh == NULL)
+ if (sh == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
sizeof(_schunkh), &_schunkh);
- if (sch == NULL)
+ if (sch == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
+
net = skb_net(skb);
ipvs = net_ipvs(net);
rcu_read_lock();
@@ -76,6 +81,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -87,19 +93,31 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->source = cp->vport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
@@ -110,6 +128,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -121,19 +140,32 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_in(cp, skb))
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->dest = cp->dport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 3588faebe529..cc65b2f42cd4 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -115,27 +115,46 @@ ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
}
-/* As ip_vs_sh_get, but with fallback if selected server is unavailable */
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
static inline struct ip_vs_dest *
ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
- unsigned int offset;
- unsigned int hash;
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
struct ip_vs_dest *dest;
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
- hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
dest = rcu_dereference(s->buckets[hash].dest);
if (!dest)
break;
- if (is_unavailable(dest))
- IP_VS_DBG_BUF(6, "SH: selected unavailable server "
- "%s:%d (offset %d)",
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port), offset);
- else
+ if (!is_unavailable(dest))
return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ ntohs(dest->port), roffset);
}
return NULL;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 2d3030ab5b61..a4b5e2a435ac 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -39,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = {
unsigned int
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
+ struct nf_conn_counter *counter;
acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
+ counter = acct->counter;
return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)atomic64_read(&acct[dir].packets),
- (unsigned long long)atomic64_read(&acct[dir].bytes));
+ (unsigned long long)atomic64_read(&counter[dir].packets),
+ (unsigned long long)atomic64_read(&counter[dir].bytes));
};
EXPORT_SYMBOL_GPL(seq_print_acct);
static struct nf_ct_ext_type acct_extend __read_mostly = {
- .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]),
- .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]),
+ .len = sizeof(struct nf_conn_acct),
+ .align = __alignof__(struct nf_conn_acct),
.id = NF_CT_EXT_ACCT,
};
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5d892febd64c..e22d950c60b3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1109,12 +1109,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
acct:
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &acct[CTINFO2DIR(ctinfo)].bytes);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
}
@@ -1126,13 +1128,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
int do_acct)
{
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
atomic64_add(skb->len - skb_network_offset(skb),
- &acct[CTINFO2DIR(ctinfo)].bytes);
+ &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index eea936b70d15..08870b859046 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -211,13 +211,23 @@ nla_put_failure:
}
static int
-dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes,
- enum ip_conntrack_dir dir)
+dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
+ enum ip_conntrack_dir dir, int type)
{
- enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nf_conn_counter *counter = acct->counter;
struct nlattr *nest_count;
+ u64 pkts, bytes;
- nest_count = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&counter[dir].packets, 0);
+ bytes = atomic64_xchg(&counter[dir].bytes, 0);
+ } else {
+ pkts = atomic64_read(&counter[dir].packets);
+ bytes = atomic64_read(&counter[dir].bytes);
+ }
+
+ nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
if (!nest_count)
goto nla_put_failure;
@@ -234,24 +244,19 @@ nla_put_failure:
}
static int
-ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
- enum ip_conntrack_dir dir, int type)
+ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
{
- struct nf_conn_counter *acct;
- u64 pkts, bytes;
+ struct nf_conn_acct *acct = nf_conn_acct_find(ct);
- acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
- if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
- pkts = atomic64_xchg(&acct[dir].packets, 0);
- bytes = atomic64_xchg(&acct[dir].bytes, 0);
- } else {
- pkts = atomic64_read(&acct[dir].packets);
- bytes = atomic64_read(&acct[dir].bytes);
- }
- return dump_counters(skb, pkts, bytes, dir);
+ if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0)
+ return -1;
+ if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
+ return -1;
+
+ return 0;
}
static int
@@ -488,8 +493,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (ctnetlink_dump_status(skb, ct) < 0 ||
ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 ||
+ ctnetlink_dump_acct(skb, ct, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
@@ -530,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct)
}
static inline size_t
-ctnetlink_counters_size(const struct nf_conn *ct)
+ctnetlink_acct_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
@@ -579,7 +583,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
- + ctnetlink_counters_size(ct)
+ + ctnetlink_acct_size(ct)
+ ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
@@ -673,10 +677,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
- if (ctnetlink_dump_counters(skb, ct,
- IP_CT_DIR_ORIGINAL, type) < 0 ||
- ctnetlink_dump_counters(skb, ct,
- IP_CT_DIR_REPLY, type) < 0 ||
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 4811f762e060..a82667c64729 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -634,9 +634,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
static void nft_match_release(void)
{
- struct nft_xt *nft_match;
+ struct nft_xt *nft_match, *tmp;
- list_for_each_entry(nft_match, &nft_match_list, head)
+ list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
kfree(nft_match);
}
@@ -705,9 +705,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
static void nft_target_release(void)
{
- struct nft_xt *nft_target;
+ struct nft_xt *nft_target, *tmp;
- list_for_each_entry(nft_target, &nft_target_list, head)
+ list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
kfree(nft_target);
}
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index b0b87b2d2411..d3b1ffe26181 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -47,8 +47,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
if (priv->sreg_addr_min) {
if (priv->family == AF_INET) {
- range.min_addr.ip = data[priv->sreg_addr_min].data[0];
- range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+ range.min_addr.ip = (__force __be32)
+ data[priv->sreg_addr_min].data[0];
+ range.max_addr.ip = (__force __be32)
+ data[priv->sreg_addr_max].data[0];
} else {
memcpy(range.min_addr.ip6,
@@ -62,8 +64,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
}
if (priv->sreg_proto_min) {
- range.min_proto.all = data[priv->sreg_proto_min].data[0];
- range.max_proto.all = data[priv->sreg_proto_max].data[0];
+ range.min_proto.all = (__force __be16)
+ data[priv->sreg_proto_min].data[0];
+ range.max_proto.all = (__force __be16)
+ data[priv->sreg_proto_max].data[0];
range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8b03028cca69..227aa11e8409 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table,
return NULL;
}
- table->private = newinfo;
newinfo->initial_entries = private->initial_entries;
+ /*
+ * Ensure contents of newinfo are visible before assigning to
+ * private.
+ */
+ smp_wmb();
+ table->private = newinfo;
/*
* Even though table entries have now been swapped, other CPU's
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 1e2fae32f81b..ed00fef58996 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -147,6 +147,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_NFQ_info_v3 *info = par->targinfo;
u32 queue = info->queuenum;
+ int ret;
if (info->queues_total > 1) {
if (info->flags & NFQ_FLAG_CPU_FANOUT) {
@@ -157,7 +158,11 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
queue = nfqueue_hash(skb, par);
}
- return NF_QUEUE_NR(queue);
+ ret = NF_QUEUE_NR(queue);
+ if (info->flags & NFQ_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ return ret;
}
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index e595e07a759b..1e634615ab9d 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -26,16 +26,18 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
u_int64_t what = 0; /* initialize to make gcc happy */
u_int64_t bytes = 0;
u_int64_t pkts = 0;
+ const struct nf_conn_acct *acct;
const struct nf_conn_counter *counters;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return false;
- counters = nf_conn_acct_find(ct);
- if (!counters)
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
return false;
+ counters = acct->counter;
switch (sinfo->what) {
case XT_CONNBYTES_PKTS:
switch (sinfo->direction) {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3dd0e374bc2b..1ba67931eb1b 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,15 +35,6 @@
#include <net/netfilter/nf_conntrack.h>
#endif
-static void
-xt_socket_put_sk(struct sock *sk)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put(inet_twsk(sk));
- else
- sock_put(sk);
-}
-
static int
extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
@@ -216,7 +207,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
inet_twsk(sk)->tw_transparent));
if (sk != skb->sk)
- xt_socket_put_sk(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -381,7 +372,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
inet_twsk(sk)->tw_transparent));
if (sk != skb->sk)
- xt_socket_put_sk(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index ea36e99089af..3591cb5dae91 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -9,6 +9,8 @@ openvswitch-y := \
datapath.o \
dp_notify.o \
flow.o \
+ flow_netlink.o \
+ flow_table.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2aa13bd7f2b2..1408adc2a2a7 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,14 +55,10 @@
#include "datapath.h"
#include "flow.h"
+#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
-
-#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
-static void rehash_flow_table(struct work_struct *work);
-static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
-
int ovs_net_id __read_mostly;
static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
@@ -165,7 +161,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
- ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
+ ovs_flow_tbl_destroy(&dp->table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
@@ -225,6 +221,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
struct dp_stats_percpu *stats;
struct sw_flow_key key;
u64 *stats_counter;
+ u32 n_mask_hit;
int error;
stats = this_cpu_ptr(dp->stats_percpu);
@@ -237,7 +234,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
}
/* Look up flow. */
- flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
+ flow = ovs_flow_tbl_lookup(&dp->table, &key, &n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
@@ -262,6 +259,7 @@ out:
/* Update datapath statistics. */
u64_stats_update_begin(&stats->sync);
(*stats_counter)++;
+ stats->n_mask_hit += n_mask_hit;
u64_stats_update_end(&stats->sync);
}
@@ -435,7 +433,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
- ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
+ ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb);
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
@@ -455,398 +453,6 @@ out:
return err;
}
-/* Called with ovs_mutex. */
-static int flush_flows(struct datapath *dp)
-{
- struct flow_table *old_table;
- struct flow_table *new_table;
-
- old_table = ovsl_dereference(dp->table);
- new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
- if (!new_table)
- return -ENOMEM;
-
- rcu_assign_pointer(dp->table, new_table);
-
- ovs_flow_tbl_destroy(old_table, true);
- return 0;
-}
-
-static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len)
-{
-
- struct sw_flow_actions *acts;
- int new_acts_size;
- int req_size = NLA_ALIGN(attr_len);
- int next_offset = offsetof(struct sw_flow_actions, actions) +
- (*sfa)->actions_len;
-
- if (req_size <= (ksize(*sfa) - next_offset))
- goto out;
-
- new_acts_size = ksize(*sfa) * 2;
-
- if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
- return ERR_PTR(-EMSGSIZE);
- new_acts_size = MAX_ACTIONS_BUFSIZE;
- }
-
- acts = ovs_flow_actions_alloc(new_acts_size);
- if (IS_ERR(acts))
- return (void *)acts;
-
- memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
- acts->actions_len = (*sfa)->actions_len;
- kfree(*sfa);
- *sfa = acts;
-
-out:
- (*sfa)->actions_len += req_size;
- return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
-}
-
-static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
-{
- struct nlattr *a;
-
- a = reserve_sfa_size(sfa, nla_attr_size(len));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- a->nla_type = attrtype;
- a->nla_len = nla_attr_size(len);
-
- if (data)
- memcpy(nla_data(a), data, len);
- memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
-
- return 0;
-}
-
-static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype)
-{
- int used = (*sfa)->actions_len;
- int err;
-
- err = add_action(sfa, attrtype, NULL, 0);
- if (err)
- return err;
-
- return used;
-}
-
-static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset)
-{
- struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset);
-
- a->nla_len = sfa->actions_len - st_offset;
-}
-
-static int validate_and_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa);
-
-static int validate_and_copy_sample(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa)
-{
- const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
- const struct nlattr *probability, *actions;
- const struct nlattr *a;
- int rem, start, err, st_acts;
-
- memset(attrs, 0, sizeof(attrs));
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
- return -EINVAL;
- attrs[type] = a;
- }
- if (rem)
- return -EINVAL;
-
- probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
- if (!probability || nla_len(probability) != sizeof(u32))
- return -EINVAL;
-
- actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
- if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
- return -EINVAL;
-
- /* validation done, copy sample action. */
- start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
- if (start < 0)
- return start;
- err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32));
- if (err)
- return err;
- st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
- if (st_acts < 0)
- return st_acts;
-
- err = validate_and_copy_actions(actions, key, depth + 1, sfa);
- if (err)
- return err;
-
- add_nested_action_end(*sfa, st_acts);
- add_nested_action_end(*sfa, start);
-
- return 0;
-}
-
-static int validate_tp_port(const struct sw_flow_key *flow_key)
-{
- if (flow_key->eth.type == htons(ETH_P_IP)) {
- if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
- return 0;
- } else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
- if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
- return 0;
- }
-
- return -EINVAL;
-}
-
-static int validate_and_copy_set_tun(const struct nlattr *attr,
- struct sw_flow_actions **sfa)
-{
- struct sw_flow_match match;
- struct sw_flow_key key;
- int err, start;
-
- ovs_match_init(&match, &key, NULL);
- err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
- if (err)
- return err;
-
- start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
- if (start < 0)
- return start;
-
- err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
- sizeof(match.key->tun_key));
- add_nested_action_end(*sfa, start);
-
- return err;
-}
-
-static int validate_set(const struct nlattr *a,
- const struct sw_flow_key *flow_key,
- struct sw_flow_actions **sfa,
- bool *set_tun)
-{
- const struct nlattr *ovs_key = nla_data(a);
- int key_type = nla_type(ovs_key);
-
- /* There can be only one key in a action */
- if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
- return -EINVAL;
-
- if (key_type > OVS_KEY_ATTR_MAX ||
- (ovs_key_lens[key_type] != nla_len(ovs_key) &&
- ovs_key_lens[key_type] != -1))
- return -EINVAL;
-
- switch (key_type) {
- const struct ovs_key_ipv4 *ipv4_key;
- const struct ovs_key_ipv6 *ipv6_key;
- int err;
-
- case OVS_KEY_ATTR_PRIORITY:
- case OVS_KEY_ATTR_SKB_MARK:
- case OVS_KEY_ATTR_ETHERNET:
- break;
-
- case OVS_KEY_ATTR_TUNNEL:
- *set_tun = true;
- err = validate_and_copy_set_tun(a, sfa);
- if (err)
- return err;
- break;
-
- case OVS_KEY_ATTR_IPV4:
- if (flow_key->eth.type != htons(ETH_P_IP))
- return -EINVAL;
-
- if (!flow_key->ip.proto)
- return -EINVAL;
-
- ipv4_key = nla_data(ovs_key);
- if (ipv4_key->ipv4_proto != flow_key->ip.proto)
- return -EINVAL;
-
- if (ipv4_key->ipv4_frag != flow_key->ip.frag)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_IPV6:
- if (flow_key->eth.type != htons(ETH_P_IPV6))
- return -EINVAL;
-
- if (!flow_key->ip.proto)
- return -EINVAL;
-
- ipv6_key = nla_data(ovs_key);
- if (ipv6_key->ipv6_proto != flow_key->ip.proto)
- return -EINVAL;
-
- if (ipv6_key->ipv6_frag != flow_key->ip.frag)
- return -EINVAL;
-
- if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_TCP:
- if (flow_key->ip.proto != IPPROTO_TCP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- case OVS_KEY_ATTR_UDP:
- if (flow_key->ip.proto != IPPROTO_UDP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- case OVS_KEY_ATTR_SCTP:
- if (flow_key->ip.proto != IPPROTO_SCTP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int validate_userspace(const struct nlattr *attr)
-{
- static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
- [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
- [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
- };
- struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
- int error;
-
- error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
- attr, userspace_policy);
- if (error)
- return error;
-
- if (!a[OVS_USERSPACE_ATTR_PID] ||
- !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
- return -EINVAL;
-
- return 0;
-}
-
-static int copy_action(const struct nlattr *from,
- struct sw_flow_actions **sfa)
-{
- int totlen = NLA_ALIGN(from->nla_len);
- struct nlattr *to;
-
- to = reserve_sfa_size(sfa, from->nla_len);
- if (IS_ERR(to))
- return PTR_ERR(to);
-
- memcpy(to, from, totlen);
- return 0;
-}
-
-static int validate_and_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key,
- int depth,
- struct sw_flow_actions **sfa)
-{
- const struct nlattr *a;
- int rem, err;
-
- if (depth >= SAMPLE_ACTION_DEPTH)
- return -EOVERFLOW;
-
- nla_for_each_nested(a, attr, rem) {
- /* Expected argument lengths, (u32)-1 for variable length. */
- static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
- [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
- [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
- [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
- [OVS_ACTION_ATTR_POP_VLAN] = 0,
- [OVS_ACTION_ATTR_SET] = (u32)-1,
- [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
- };
- const struct ovs_action_push_vlan *vlan;
- int type = nla_type(a);
- bool skip_copy;
-
- if (type > OVS_ACTION_ATTR_MAX ||
- (action_lens[type] != nla_len(a) &&
- action_lens[type] != (u32)-1))
- return -EINVAL;
-
- skip_copy = false;
- switch (type) {
- case OVS_ACTION_ATTR_UNSPEC:
- return -EINVAL;
-
- case OVS_ACTION_ATTR_USERSPACE:
- err = validate_userspace(a);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_OUTPUT:
- if (nla_get_u32(a) >= DP_MAX_PORTS)
- return -EINVAL;
- break;
-
-
- case OVS_ACTION_ATTR_POP_VLAN:
- break;
-
- case OVS_ACTION_ATTR_PUSH_VLAN:
- vlan = nla_data(a);
- if (vlan->vlan_tpid != htons(ETH_P_8021Q))
- return -EINVAL;
- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
- return -EINVAL;
- break;
-
- case OVS_ACTION_ATTR_SET:
- err = validate_set(a, key, sfa, &skip_copy);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(a, key, depth, sfa);
- if (err)
- return err;
- skip_copy = true;
- break;
-
- default:
- return -EINVAL;
- }
- if (!skip_copy) {
- err = copy_action(a, sfa);
- if (err)
- return err;
- }
- }
-
- if (rem > 0)
- return -EINVAL;
-
- return 0;
-}
-
static void clear_stats(struct sw_flow *flow)
{
flow->used = 0;
@@ -902,15 +508,16 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_flow_free;
- err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
+ err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
- acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
err = PTR_ERR(acts);
if (IS_ERR(acts))
goto err_flow_free;
- err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
+ err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
+ &flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
@@ -958,15 +565,18 @@ static struct genl_ops dp_packet_genl_ops[] = {
}
};
-static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
+ struct ovs_dp_megaflow_stats *mega_stats)
{
- struct flow_table *table;
int i;
- table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held());
- stats->n_flows = ovs_flow_tbl_count(table);
+ memset(mega_stats, 0, sizeof(*mega_stats));
+
+ stats->n_flows = ovs_flow_tbl_count(&dp->table);
+ mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
stats->n_hit = stats->n_missed = stats->n_lost = 0;
+
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
@@ -982,6 +592,7 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
+ mega_stats->n_mask_hit += local_stats.n_mask_hit;
}
}
@@ -1005,100 +616,6 @@ static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
-static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb);
-static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
-{
- const struct nlattr *a;
- struct nlattr *start;
- int err = 0, rem;
-
- start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
- if (!start)
- return -EMSGSIZE;
-
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- struct nlattr *st_sample;
-
- switch (type) {
- case OVS_SAMPLE_ATTR_PROBABILITY:
- if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a)))
- return -EMSGSIZE;
- break;
- case OVS_SAMPLE_ATTR_ACTIONS:
- st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
- if (!st_sample)
- return -EMSGSIZE;
- err = actions_to_attr(nla_data(a), nla_len(a), skb);
- if (err)
- return err;
- nla_nest_end(skb, st_sample);
- break;
- }
- }
-
- nla_nest_end(skb, start);
- return err;
-}
-
-static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
-{
- const struct nlattr *ovs_key = nla_data(a);
- int key_type = nla_type(ovs_key);
- struct nlattr *start;
- int err;
-
- switch (key_type) {
- case OVS_KEY_ATTR_IPV4_TUNNEL:
- start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
- if (!start)
- return -EMSGSIZE;
-
- err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
- nla_data(ovs_key));
- if (err)
- return err;
- nla_nest_end(skb, start);
- break;
- default:
- if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
- return -EMSGSIZE;
- break;
- }
-
- return 0;
-}
-
-static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb)
-{
- const struct nlattr *a;
- int rem, err;
-
- nla_for_each_attr(a, attr, len, rem) {
- int type = nla_type(a);
-
- switch (type) {
- case OVS_ACTION_ATTR_SET:
- err = set_action_to_attr(a, skb);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_SAMPLE:
- err = sample_action_to_attr(a, skb);
- if (err)
- return err;
- break;
- default:
- if (nla_put(skb, type, nla_len(a), nla_data(a)))
- return -EMSGSIZE;
- break;
- }
- }
-
- return 0;
-}
-
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
@@ -1135,8 +652,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->unmasked_key,
- &flow->unmasked_key, skb);
+ err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
if (err)
goto error;
nla_nest_end(skb, nla);
@@ -1145,7 +661,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
+ err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
if (err)
goto error;
@@ -1155,7 +671,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
used = flow->used;
stats.n_packets = flow->packet_count;
stats.n_bytes = flow->byte_count;
- tcp_flags = flow->tcp_flags;
+ tcp_flags = (u8)ntohs(flow->tcp_flags);
spin_unlock_bh(&flow->lock);
if (used &&
@@ -1188,7 +704,8 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
sf_acts = rcu_dereference_check(flow->sf_acts,
lockdep_ovsl_is_held());
- err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb);
+ err = ovs_nla_put_actions(sf_acts->actions,
+ sf_acts->actions_len, skb);
if (!err)
nla_nest_end(skb, start);
else {
@@ -1234,6 +751,14 @@ static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
return skb;
}
+static struct sw_flow *__ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key)
+{
+ u32 __always_unused n_mask_hit;
+
+ return ovs_flow_tbl_lookup(tbl, key, &n_mask_hit);
+}
+
static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
@@ -1243,7 +768,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_actions *acts = NULL;
struct sw_flow_match match;
int error;
@@ -1254,21 +778,21 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
goto error;
ovs_match_init(&match, &key, &mask);
- error = ovs_match_from_nlattrs(&match,
- a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
+ error = ovs_nla_get_match(&match,
+ a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
- acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
error = PTR_ERR(acts);
if (IS_ERR(acts))
goto error;
- ovs_flow_key_mask(&masked_key, &key, &mask);
- error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
- &masked_key, 0, &acts);
+ ovs_flow_mask_key(&masked_key, &key, &mask);
+ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+ &masked_key, 0, &acts);
if (error) {
OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
goto err_kfree;
@@ -1284,29 +808,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (!dp)
goto err_unlock_ovs;
- table = ovsl_dereference(dp->table);
-
/* Check if this is a duplicate flow */
- flow = ovs_flow_lookup(table, &key);
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
if (!flow) {
- struct sw_flow_mask *mask_p;
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
goto err_unlock_ovs;
- /* Expand table, if necessary, to make room. */
- if (ovs_flow_tbl_need_to_expand(table)) {
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_expand(table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_destroy(table, true);
- table = ovsl_dereference(dp->table);
- }
- }
-
/* Allocate flow. */
flow = ovs_flow_alloc();
if (IS_ERR(flow)) {
@@ -1317,25 +826,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
flow->key = masked_key;
flow->unmasked_key = key;
-
- /* Make sure mask is unique in the system */
- mask_p = ovs_sw_flow_mask_find(table, &mask);
- if (!mask_p) {
- /* Allocate a new mask if none exsits. */
- mask_p = ovs_sw_flow_mask_alloc();
- if (!mask_p)
- goto err_flow_free;
- mask_p->key = mask.key;
- mask_p->range = mask.range;
- ovs_sw_flow_mask_insert(table, mask_p);
- }
-
- ovs_sw_flow_mask_add_ref(mask_p);
- flow->mask = mask_p;
rcu_assign_pointer(flow->sf_acts, acts);
/* Put flow in bucket. */
- ovs_flow_insert(table, flow);
+ error = ovs_flow_tbl_insert(&dp->table, flow, &mask);
+ if (error) {
+ acts = NULL;
+ goto err_flow_free;
+ }
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq, OVS_FLOW_CMD_NEW);
@@ -1356,7 +854,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* The unmasked key has to be the same for flow updates. */
error = -EINVAL;
- if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
+ if (!ovs_flow_cmp_unmasked_key(flow, &match)) {
OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
goto err_unlock_ovs;
}
@@ -1364,7 +862,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
- ovs_flow_deferred_free_acts(old_acts);
+ ovs_nla_free_flow_actions(old_acts);
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq, OVS_FLOW_CMD_NEW);
@@ -1403,7 +901,6 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_match match;
int err;
@@ -1413,7 +910,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
}
ovs_match_init(&match, &key, NULL);
- err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
return err;
@@ -1424,9 +921,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
- table = ovsl_dereference(dp->table);
- flow = ovs_flow_lookup_unmasked_key(table, &match);
- if (!flow) {
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
+ if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
err = -ENOENT;
goto unlock;
}
@@ -1453,7 +949,6 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_match match;
int err;
@@ -1465,18 +960,17 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
}
if (!a[OVS_FLOW_ATTR_KEY]) {
- err = flush_flows(dp);
+ err = ovs_flow_tbl_flush(&dp->table);
goto unlock;
}
ovs_match_init(&match, &key, NULL);
- err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
goto unlock;
- table = ovsl_dereference(dp->table);
- flow = ovs_flow_lookup_unmasked_key(table, &match);
- if (!flow) {
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
+ if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
err = -ENOENT;
goto unlock;
}
@@ -1487,7 +981,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
- ovs_flow_remove(table, flow);
+ ovs_flow_tbl_remove(&dp->table, flow);
err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_FLOW_CMD_DEL);
@@ -1506,8 +1000,8 @@ unlock:
static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct table_instance *ti;
struct datapath *dp;
- struct flow_table *table;
rcu_read_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -1516,14 +1010,14 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
return -ENODEV;
}
- table = rcu_dereference(dp->table);
+ ti = rcu_dereference(dp->table.ti);
for (;;) {
struct sw_flow *flow;
u32 bucket, obj;
bucket = cb->args[0];
obj = cb->args[1];
- flow = ovs_flow_dump_next(table, &bucket, &obj);
+ flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
if (!flow)
break;
@@ -1589,6 +1083,7 @@ static size_t ovs_dp_cmd_msg_size(void)
msgsize += nla_total_size(IFNAMSIZ);
msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
+ msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
return msgsize;
}
@@ -1598,6 +1093,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
{
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
+ struct ovs_dp_megaflow_stats dp_megaflow_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
@@ -1613,8 +1109,14 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
if (err)
goto nla_put_failure;
- get_dp_stats(dp, &dp_stats);
- if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats))
+ get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
+ if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+ &dp_stats))
+ goto nla_put_failure;
+
+ if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+ sizeof(struct ovs_dp_megaflow_stats),
+ &dp_megaflow_stats))
goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
@@ -1687,9 +1189,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
/* Allocate table. */
- err = -ENOMEM;
- rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
- if (!dp->table)
+ err = ovs_flow_tbl_init(&dp->table);
+ if (err)
goto err_free_dp;
dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
@@ -1699,7 +1200,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!dp->ports) {
err = -ENOMEM;
goto err_destroy_percpu;
@@ -1746,7 +1247,7 @@ err_destroy_ports_array:
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
+ ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
release_net(ovs_dp_get_net(dp));
kfree(dp);
@@ -2336,32 +1837,6 @@ error:
return err;
}
-static void rehash_flow_table(struct work_struct *work)
-{
- struct datapath *dp;
- struct net *net;
-
- ovs_lock();
- rtnl_lock();
- for_each_net(net) {
- struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
-
- list_for_each_entry(dp, &ovs_net->dps, list_node) {
- struct flow_table *old_table = ovsl_dereference(dp->table);
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_rehash(old_table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_destroy(old_table, true);
- }
- }
- }
- rtnl_unlock();
- ovs_unlock();
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-}
-
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -2419,8 +1894,6 @@ static int __init dp_init(void)
if (err < 0)
goto error_unreg_notifier;
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-
return 0;
error_unreg_notifier:
@@ -2437,7 +1910,6 @@ error:
static void dp_cleanup(void)
{
- cancel_delayed_work_sync(&rehash_flow_wq);
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
unregister_netdevice_notifier(&ovs_dp_device_notifier);
unregister_pernet_device(&ovs_net_ops);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4d109c176ef3..d3d14a58aa91 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -27,6 +27,7 @@
#include <linux/u64_stats_sync.h>
#include "flow.h"
+#include "flow_table.h"
#include "vport.h"
#define DP_MAX_PORTS USHRT_MAX
@@ -45,11 +46,15 @@
* @n_lost: Number of received packets that had no matching flow in the flow
* table that could not be sent to userspace (normally due to an overflow in
* one of the datapath's queues).
+ * @n_mask_hit: Number of masks looked up for flow match.
+ * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked
+ * up per packet.
*/
struct dp_stats_percpu {
u64 n_hit;
u64 n_missed;
u64 n_lost;
+ u64 n_mask_hit;
struct u64_stats_sync sync;
};
@@ -57,7 +62,7 @@ struct dp_stats_percpu {
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
- * @table: Current flow table. Protected by ovs_mutex and RCU.
+ * @table: flow table.
* @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
* ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
@@ -71,7 +76,7 @@ struct datapath {
struct list_head list_node;
/* Flow table. */
- struct flow_table __rcu *table;
+ struct flow_table table;
/* Switch ports. */
struct hlist_head *ports;
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index c3235675f359..5c2dab276109 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -65,8 +65,7 @@ void ovs_dp_notify_wq(struct work_struct *work)
continue;
netdev_vport = netdev_vport_priv(vport);
- if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED ||
- netdev_vport->dev->reg_state == NETREG_UNREGISTERING)
+ if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
@@ -88,6 +87,10 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event,
return NOTIFY_DONE;
if (event == NETDEV_UNREGISTER) {
+ /* upper_dev_unlink and decrement promisc immediately */
+ ovs_netdev_detach_dev(vport);
+
+ /* schedule vport destroy, dev_put and genl notification */
ovs_net = net_generic(dev_net(dev), ovs_net_id);
queue_work(system_wq, &ovs_net->dp_notify_work);
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 410db90db73d..b409f5279601 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -45,202 +45,38 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
-static struct kmem_cache *flow_cache;
-
-static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
- struct sw_flow_key_range *range, u8 val);
-
-static void update_range__(struct sw_flow_match *match,
- size_t offset, size_t size, bool is_mask)
+u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
- struct sw_flow_key_range *range = NULL;
- size_t start = rounddown(offset, sizeof(long));
- size_t end = roundup(offset + size, sizeof(long));
-
- if (!is_mask)
- range = &match->range;
- else if (match->mask)
- range = &match->mask->range;
-
- if (!range)
- return;
-
- if (range->start == range->end) {
- range->start = start;
- range->end = end;
- return;
- }
-
- if (range->start > start)
- range->start = start;
+ struct timespec cur_ts;
+ u64 cur_ms, idle_ms;
- if (range->end < end)
- range->end = end;
-}
+ ktime_get_ts(&cur_ts);
+ idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
+ cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ts.tv_nsec / NSEC_PER_MSEC;
-#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
- do { \
- update_range__(match, offsetof(struct sw_flow_key, field), \
- sizeof((match)->key->field), is_mask); \
- if (is_mask) { \
- if ((match)->mask) \
- (match)->mask->key.field = value; \
- } else { \
- (match)->key->field = value; \
- } \
- } while (0)
-
-#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
- do { \
- update_range__(match, offsetof(struct sw_flow_key, field), \
- len, is_mask); \
- if (is_mask) { \
- if ((match)->mask) \
- memcpy(&(match)->mask->key.field, value_p, len);\
- } else { \
- memcpy(&(match)->key->field, value_p, len); \
- } \
- } while (0)
-
-static u16 range_n_bytes(const struct sw_flow_key_range *range)
-{
- return range->end - range->start;
+ return cur_ms - idle_ms;
}
-void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key,
- struct sw_flow_mask *mask)
-{
- memset(match, 0, sizeof(*match));
- match->key = key;
- match->mask = mask;
+#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
- memset(key, 0, sizeof(*key));
-
- if (mask) {
- memset(&mask->key, 0, sizeof(mask->key));
- mask->range.start = mask->range.end = 0;
- }
-}
-
-static bool ovs_match_validate(const struct sw_flow_match *match,
- u64 key_attrs, u64 mask_attrs)
+void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
{
- u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
- u64 mask_allowed = key_attrs; /* At most allow all key attributes */
-
- /* The following mask attributes allowed only if they
- * pass the validation tests. */
- mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
- | (1 << OVS_KEY_ATTR_IPV6)
- | (1 << OVS_KEY_ATTR_TCP)
- | (1 << OVS_KEY_ATTR_UDP)
- | (1 << OVS_KEY_ATTR_SCTP)
- | (1 << OVS_KEY_ATTR_ICMP)
- | (1 << OVS_KEY_ATTR_ICMPV6)
- | (1 << OVS_KEY_ATTR_ARP)
- | (1 << OVS_KEY_ATTR_ND));
-
- /* Always allowed mask fields. */
- mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
- | (1 << OVS_KEY_ATTR_IN_PORT)
- | (1 << OVS_KEY_ATTR_ETHERTYPE));
-
- /* Check key attributes. */
- if (match->key->eth.type == htons(ETH_P_ARP)
- || match->key->eth.type == htons(ETH_P_RARP)) {
- key_expected |= 1 << OVS_KEY_ATTR_ARP;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
- }
+ __be16 tcp_flags = 0;
- if (match->key->eth.type == htons(ETH_P_IP)) {
- key_expected |= 1 << OVS_KEY_ATTR_IPV4;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
-
- if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
- if (match->key->ip.proto == IPPROTO_UDP) {
- key_expected |= 1 << OVS_KEY_ATTR_UDP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
- }
-
- if (match->key->ip.proto == IPPROTO_SCTP) {
- key_expected |= 1 << OVS_KEY_ATTR_SCTP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
- }
-
- if (match->key->ip.proto == IPPROTO_TCP) {
- key_expected |= 1 << OVS_KEY_ATTR_TCP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
- }
-
- if (match->key->ip.proto == IPPROTO_ICMP) {
- key_expected |= 1 << OVS_KEY_ATTR_ICMP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
- }
- }
- }
-
- if (match->key->eth.type == htons(ETH_P_IPV6)) {
- key_expected |= 1 << OVS_KEY_ATTR_IPV6;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
-
- if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
- if (match->key->ip.proto == IPPROTO_UDP) {
- key_expected |= 1 << OVS_KEY_ATTR_UDP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
- }
-
- if (match->key->ip.proto == IPPROTO_SCTP) {
- key_expected |= 1 << OVS_KEY_ATTR_SCTP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
- }
-
- if (match->key->ip.proto == IPPROTO_TCP) {
- key_expected |= 1 << OVS_KEY_ATTR_TCP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
- }
-
- if (match->key->ip.proto == IPPROTO_ICMPV6) {
- key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
-
- if (match->key->ipv6.tp.src ==
- htons(NDISC_NEIGHBOUR_SOLICITATION) ||
- match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
- key_expected |= 1 << OVS_KEY_ATTR_ND;
- if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_ND;
- }
- }
- }
- }
-
- if ((key_attrs & key_expected) != key_expected) {
- /* Key attributes check failed. */
- OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
- key_attrs, key_expected);
- return false;
- }
-
- if ((mask_attrs & mask_allowed) != mask_attrs) {
- /* Mask attributes check failed. */
- OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
- mask_attrs, mask_allowed);
- return false;
+ if ((flow->key.eth.type == htons(ETH_P_IP) ||
+ flow->key.eth.type == htons(ETH_P_IPV6)) &&
+ flow->key.ip.proto == IPPROTO_TCP &&
+ likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
+ tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
}
- return true;
+ spin_lock(&flow->lock);
+ flow->used = jiffies;
+ flow->packet_count++;
+ flow->byte_count += skb->len;
+ flow->tcp_flags |= tcp_flags;
+ spin_unlock(&flow->lock);
}
static int check_header(struct sk_buff *skb, int len)
@@ -311,19 +147,6 @@ static bool icmphdr_ok(struct sk_buff *skb)
sizeof(struct icmphdr));
}
-u64 ovs_flow_used_time(unsigned long flow_jiffies)
-{
- struct timespec cur_ts;
- u64 cur_ms, idle_ms;
-
- ktime_get_ts(&cur_ts);
- idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
- cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
- cur_ts.tv_nsec / NSEC_PER_MSEC;
-
- return cur_ms - idle_ms;
-}
-
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
unsigned int nh_ofs = skb_network_offset(skb);
@@ -372,311 +195,6 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
-void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
- const struct sw_flow_mask *mask)
-{
- const long *m = (long *)((u8 *)&mask->key + mask->range.start);
- const long *s = (long *)((u8 *)src + mask->range.start);
- long *d = (long *)((u8 *)dst + mask->range.start);
- int i;
-
- /* The memory outside of the 'mask->range' are not set since
- * further operations on 'dst' only uses contents within
- * 'mask->range'.
- */
- for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
- *d++ = *s++ & *m++;
-}
-
-#define TCP_FLAGS_OFFSET 13
-#define TCP_FLAG_MASK 0x3f
-
-void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
-{
- u8 tcp_flags = 0;
-
- if ((flow->key.eth.type == htons(ETH_P_IP) ||
- flow->key.eth.type == htons(ETH_P_IPV6)) &&
- flow->key.ip.proto == IPPROTO_TCP &&
- likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
- u8 *tcp = (u8 *)tcp_hdr(skb);
- tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
- }
-
- spin_lock(&flow->lock);
- flow->used = jiffies;
- flow->packet_count++;
- flow->byte_count += skb->len;
- flow->tcp_flags |= tcp_flags;
- spin_unlock(&flow->lock);
-}
-
-struct sw_flow_actions *ovs_flow_actions_alloc(int size)
-{
- struct sw_flow_actions *sfa;
-
- if (size > MAX_ACTIONS_BUFSIZE)
- return ERR_PTR(-EINVAL);
-
- sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
- if (!sfa)
- return ERR_PTR(-ENOMEM);
-
- sfa->actions_len = 0;
- return sfa;
-}
-
-struct sw_flow *ovs_flow_alloc(void)
-{
- struct sw_flow *flow;
-
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
- if (!flow)
- return ERR_PTR(-ENOMEM);
-
- spin_lock_init(&flow->lock);
- flow->sf_acts = NULL;
- flow->mask = NULL;
-
- return flow;
-}
-
-static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
-{
- hash = jhash_1word(hash, table->hash_seed);
- return flex_array_get(table->buckets,
- (hash & (table->n_buckets - 1)));
-}
-
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
- struct flex_array *buckets;
- int i, err;
-
- buckets = flex_array_alloc(sizeof(struct hlist_head),
- n_buckets, GFP_KERNEL);
- if (!buckets)
- return NULL;
-
- err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
- if (err) {
- flex_array_free(buckets);
- return NULL;
- }
-
- for (i = 0; i < n_buckets; i++)
- INIT_HLIST_HEAD((struct hlist_head *)
- flex_array_get(buckets, i));
-
- return buckets;
-}
-
-static void free_buckets(struct flex_array *buckets)
-{
- flex_array_free(buckets);
-}
-
-static struct flow_table *__flow_tbl_alloc(int new_size)
-{
- struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
-
- if (!table)
- return NULL;
-
- table->buckets = alloc_buckets(new_size);
-
- if (!table->buckets) {
- kfree(table);
- return NULL;
- }
- table->n_buckets = new_size;
- table->count = 0;
- table->node_ver = 0;
- table->keep_flows = false;
- get_random_bytes(&table->hash_seed, sizeof(u32));
- table->mask_list = NULL;
-
- return table;
-}
-
-static void __flow_tbl_destroy(struct flow_table *table)
-{
- int i;
-
- if (table->keep_flows)
- goto skip_flows;
-
- for (i = 0; i < table->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head = flex_array_get(table->buckets, i);
- struct hlist_node *n;
- int ver = table->node_ver;
-
- hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
- hlist_del(&flow->hash_node[ver]);
- ovs_flow_free(flow, false);
- }
- }
-
- BUG_ON(!list_empty(table->mask_list));
- kfree(table->mask_list);
-
-skip_flows:
- free_buckets(table->buckets);
- kfree(table);
-}
-
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
-{
- struct flow_table *table = __flow_tbl_alloc(new_size);
-
- if (!table)
- return NULL;
-
- table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
- if (!table->mask_list) {
- table->keep_flows = true;
- __flow_tbl_destroy(table);
- return NULL;
- }
- INIT_LIST_HEAD(table->mask_list);
-
- return table;
-}
-
-static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
-{
- struct flow_table *table = container_of(rcu, struct flow_table, rcu);
-
- __flow_tbl_destroy(table);
-}
-
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
-{
- if (!table)
- return;
-
- if (deferred)
- call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
- else
- __flow_tbl_destroy(table);
-}
-
-struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
-{
- struct sw_flow *flow;
- struct hlist_head *head;
- int ver;
- int i;
-
- ver = table->node_ver;
- while (*bucket < table->n_buckets) {
- i = 0;
- head = flex_array_get(table->buckets, *bucket);
- hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
- if (i < *last) {
- i++;
- continue;
- }
- *last = i + 1;
- return flow;
- }
- (*bucket)++;
- *last = 0;
- }
-
- return NULL;
-}
-
-static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
-{
- struct hlist_head *head;
-
- head = find_bucket(table, flow->hash);
- hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
-
- table->count++;
-}
-
-static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
-{
- int old_ver;
- int i;
-
- old_ver = old->node_ver;
- new->node_ver = !old_ver;
-
- /* Insert in new table. */
- for (i = 0; i < old->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head;
-
- head = flex_array_get(old->buckets, i);
-
- hlist_for_each_entry(flow, head, hash_node[old_ver])
- __tbl_insert(new, flow);
- }
-
- new->mask_list = old->mask_list;
- old->keep_flows = true;
-}
-
-static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
-{
- struct flow_table *new_table;
-
- new_table = __flow_tbl_alloc(n_buckets);
- if (!new_table)
- return ERR_PTR(-ENOMEM);
-
- flow_table_copy_flows(table, new_table);
-
- return new_table;
-}
-
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets);
-}
-
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets * 2);
-}
-
-static void __flow_free(struct sw_flow *flow)
-{
- kfree((struct sf_flow_acts __force *)flow->sf_acts);
- kmem_cache_free(flow_cache, flow);
-}
-
-static void rcu_free_flow_callback(struct rcu_head *rcu)
-{
- struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
-
- __flow_free(flow);
-}
-
-void ovs_flow_free(struct sw_flow *flow, bool deferred)
-{
- if (!flow)
- return;
-
- ovs_sw_flow_mask_del_ref(flow->mask, deferred);
-
- if (deferred)
- call_rcu(&flow->rcu, rcu_free_flow_callback);
- else
- __flow_free(flow);
-}
-
-/* Schedules 'sf_acts' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
-{
- kfree_rcu(sf_acts, rcu);
-}
-
static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
struct qtag_prefix {
@@ -910,6 +428,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
struct tcphdr *tcp = tcp_hdr(skb);
key->ipv4.tp.src = tcp->source;
key->ipv4.tp.dst = tcp->dest;
+ key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == IPPROTO_UDP) {
if (udphdr_ok(skb)) {
@@ -978,6 +497,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
struct tcphdr *tcp = tcp_hdr(skb);
key->ipv6.tp.src = tcp->source;
key->ipv6.tp.dst = tcp->dest;
+ key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == NEXTHDR_UDP) {
if (udphdr_ok(skb)) {
@@ -1002,1080 +522,3 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
return 0;
}
-
-static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start,
- int key_end)
-{
- u32 *hash_key = (u32 *)((u8 *)key + key_start);
- int hash_u32s = (key_end - key_start) >> 2;
-
- /* Make sure number of hash bytes are multiple of u32. */
- BUILD_BUG_ON(sizeof(long) % sizeof(u32));
-
- return jhash2(hash_key, hash_u32s, 0);
-}
-
-static int flow_key_start(const struct sw_flow_key *key)
-{
- if (key->tun_key.ipv4_dst)
- return 0;
- else
- return rounddown(offsetof(struct sw_flow_key, phy),
- sizeof(long));
-}
-
-static bool __cmp_key(const struct sw_flow_key *key1,
- const struct sw_flow_key *key2, int key_start, int key_end)
-{
- const long *cp1 = (long *)((u8 *)key1 + key_start);
- const long *cp2 = (long *)((u8 *)key2 + key_start);
- long diffs = 0;
- int i;
-
- for (i = key_start; i < key_end; i += sizeof(long))
- diffs |= *cp1++ ^ *cp2++;
-
- return diffs == 0;
-}
-
-static bool __flow_cmp_masked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_start, int key_end)
-{
- return __cmp_key(&flow->key, key, key_start, key_end);
-}
-
-static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_start, int key_end)
-{
- return __cmp_key(&flow->unmasked_key, key, key_start, key_end);
-}
-
-bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_end)
-{
- int key_start;
- key_start = flow_key_start(key);
-
- return __flow_cmp_unmasked_key(flow, key, key_start, key_end);
-
-}
-
-struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
- struct sw_flow_match *match)
-{
- struct sw_flow_key *unmasked = match->key;
- int key_end = match->range.end;
- struct sw_flow *flow;
-
- flow = ovs_flow_lookup(table, unmasked);
- if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end)))
- flow = NULL;
-
- return flow;
-}
-
-static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
- const struct sw_flow_key *unmasked,
- struct sw_flow_mask *mask)
-{
- struct sw_flow *flow;
- struct hlist_head *head;
- int key_start = mask->range.start;
- int key_end = mask->range.end;
- u32 hash;
- struct sw_flow_key masked_key;
-
- ovs_flow_key_mask(&masked_key, unmasked, mask);
- hash = ovs_flow_hash(&masked_key, key_start, key_end);
- head = find_bucket(table, hash);
- hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
- if (flow->mask == mask &&
- __flow_cmp_masked_key(flow, &masked_key,
- key_start, key_end))
- return flow;
- }
- return NULL;
-}
-
-struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
- const struct sw_flow_key *key)
-{
- struct sw_flow *flow = NULL;
- struct sw_flow_mask *mask;
-
- list_for_each_entry_rcu(mask, tbl->mask_list, list) {
- flow = ovs_masked_flow_lookup(tbl, key, mask);
- if (flow) /* Found */
- break;
- }
-
- return flow;
-}
-
-
-void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow)
-{
- flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start,
- flow->mask->range.end);
- __tbl_insert(table, flow);
-}
-
-void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow)
-{
- BUG_ON(table->count == 0);
- hlist_del_rcu(&flow->hash_node[table->node_ver]);
- table->count--;
-}
-
-/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
-const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
- [OVS_KEY_ATTR_ENCAP] = -1,
- [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
- [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
- [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
- [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
- [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
- [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
- [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
- [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
- [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
- [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
- [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
- [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
- [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
- [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
- [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
- [OVS_KEY_ATTR_TUNNEL] = -1,
-};
-
-static bool is_all_zero(const u8 *fp, size_t size)
-{
- int i;
-
- if (!fp)
- return false;
-
- for (i = 0; i < size; i++)
- if (fp[i])
- return false;
-
- return true;
-}
-
-static int __parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[],
- u64 *attrsp, bool nz)
-{
- const struct nlattr *nla;
- u32 attrs;
- int rem;
-
- attrs = *attrsp;
- nla_for_each_nested(nla, attr, rem) {
- u16 type = nla_type(nla);
- int expected_len;
-
- if (type > OVS_KEY_ATTR_MAX) {
- OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
- type, OVS_KEY_ATTR_MAX);
- return -EINVAL;
- }
-
- if (attrs & (1 << type)) {
- OVS_NLERR("Duplicate key attribute (type %d).\n", type);
- return -EINVAL;
- }
-
- expected_len = ovs_key_lens[type];
- if (nla_len(nla) != expected_len && expected_len != -1) {
- OVS_NLERR("Key attribute has unexpected length (type=%d"
- ", length=%d, expected=%d).\n", type,
- nla_len(nla), expected_len);
- return -EINVAL;
- }
-
- if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
- attrs |= 1 << type;
- a[type] = nla;
- }
- }
- if (rem) {
- OVS_NLERR("Message has %d unknown bytes.\n", rem);
- return -EINVAL;
- }
-
- *attrsp = attrs;
- return 0;
-}
-
-static int parse_flow_mask_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp)
-{
- return __parse_flow_nlattrs(attr, a, attrsp, true);
-}
-
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp)
-{
- return __parse_flow_nlattrs(attr, a, attrsp, false);
-}
-
-int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
- struct sw_flow_match *match, bool is_mask)
-{
- struct nlattr *a;
- int rem;
- bool ttl = false;
- __be16 tun_flags = 0;
-
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
- [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
- [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
- [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
- [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
- [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
- };
-
- if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
- OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
- type, OVS_TUNNEL_KEY_ATTR_MAX);
- return -EINVAL;
- }
-
- if (ovs_tunnel_key_lens[type] != nla_len(a)) {
- OVS_NLERR("IPv4 tunnel attribute type has unexpected "
- " length (type=%d, length=%d, expected=%d).\n",
- type, nla_len(a), ovs_tunnel_key_lens[type]);
- return -EINVAL;
- }
-
- switch (type) {
- case OVS_TUNNEL_KEY_ATTR_ID:
- SW_FLOW_KEY_PUT(match, tun_key.tun_id,
- nla_get_be64(a), is_mask);
- tun_flags |= TUNNEL_KEY;
- break;
- case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
- nla_get_be32(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
- nla_get_be32(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_TOS:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
- nla_get_u8(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_TTL:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
- nla_get_u8(a), is_mask);
- ttl = true;
- break;
- case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
- tun_flags |= TUNNEL_DONT_FRAGMENT;
- break;
- case OVS_TUNNEL_KEY_ATTR_CSUM:
- tun_flags |= TUNNEL_CSUM;
- break;
- default:
- return -EINVAL;
- }
- }
-
- SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
-
- if (rem > 0) {
- OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
- return -EINVAL;
- }
-
- if (!is_mask) {
- if (!match->key->tun_key.ipv4_dst) {
- OVS_NLERR("IPv4 tunnel destination address is zero.\n");
- return -EINVAL;
- }
-
- if (!ttl) {
- OVS_NLERR("IPv4 tunnel TTL not specified.\n");
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *tun_key,
- const struct ovs_key_ipv4_tunnel *output)
-{
- struct nlattr *nla;
-
- nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
- if (!nla)
- return -EMSGSIZE;
-
- if (output->tun_flags & TUNNEL_KEY &&
- nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
- return -EMSGSIZE;
- if (output->ipv4_src &&
- nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
- return -EMSGSIZE;
- if (output->ipv4_dst &&
- nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
- return -EMSGSIZE;
- if (output->ipv4_tos &&
- nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
- return -EMSGSIZE;
- if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
- return -EMSGSIZE;
- if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
- nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
- return -EMSGSIZE;
- if ((output->tun_flags & TUNNEL_CSUM) &&
- nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
- return -EMSGSIZE;
-
- nla_nest_end(skb, nla);
- return 0;
-}
-
-static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
- const struct nlattr **a, bool is_mask)
-{
- if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
- SW_FLOW_KEY_PUT(match, phy.priority,
- nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
- }
-
- if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
- u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
-
- if (is_mask)
- in_port = 0xffffffff; /* Always exact match in_port. */
- else if (in_port >= DP_MAX_PORTS)
- return -EINVAL;
-
- SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
- }
-
- if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
- uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
-
- SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
- }
- if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
- if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
- is_mask))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
- }
- return 0;
-}
-
-static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
- const struct nlattr **a, bool is_mask)
-{
- int err;
- u64 orig_attrs = attrs;
-
- err = metadata_from_nlattrs(match, &attrs, a, is_mask);
- if (err)
- return err;
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
- const struct ovs_key_ethernet *eth_key;
-
- eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
- SW_FLOW_KEY_MEMCPY(match, eth.src,
- eth_key->eth_src, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, eth.dst,
- eth_key->eth_dst, ETH_ALEN, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- __be16 tci;
-
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- if (is_mask)
- OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
- else
- OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
-
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
- } else if (!is_mask)
- SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- __be16 eth_type;
-
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (is_mask) {
- /* Always exact match EtherType. */
- eth_type = htons(0xffff);
- } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
- OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
- ntohs(eth_type), ETH_P_802_3_MIN);
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- const struct ovs_key_ipv4 *ipv4_key;
-
- ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
- if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
- OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
- ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
- return -EINVAL;
- }
- SW_FLOW_KEY_PUT(match, ip.proto,
- ipv4_key->ipv4_proto, is_mask);
- SW_FLOW_KEY_PUT(match, ip.tos,
- ipv4_key->ipv4_tos, is_mask);
- SW_FLOW_KEY_PUT(match, ip.ttl,
- ipv4_key->ipv4_ttl, is_mask);
- SW_FLOW_KEY_PUT(match, ip.frag,
- ipv4_key->ipv4_frag, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.src,
- ipv4_key->ipv4_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
- ipv4_key->ipv4_dst, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
- const struct ovs_key_ipv6 *ipv6_key;
-
- ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
- if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
- OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
- ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
- return -EINVAL;
- }
- SW_FLOW_KEY_PUT(match, ipv6.label,
- ipv6_key->ipv6_label, is_mask);
- SW_FLOW_KEY_PUT(match, ip.proto,
- ipv6_key->ipv6_proto, is_mask);
- SW_FLOW_KEY_PUT(match, ip.tos,
- ipv6_key->ipv6_tclass, is_mask);
- SW_FLOW_KEY_PUT(match, ip.ttl,
- ipv6_key->ipv6_hlimit, is_mask);
- SW_FLOW_KEY_PUT(match, ip.frag,
- ipv6_key->ipv6_frag, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
- ipv6_key->ipv6_src,
- sizeof(match->key->ipv6.addr.src),
- is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
- ipv6_key->ipv6_dst,
- sizeof(match->key->ipv6.addr.dst),
- is_mask);
-
- attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
- const struct ovs_key_arp *arp_key;
-
- arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
- if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
- OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
- arp_key->arp_op);
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, ipv4.addr.src,
- arp_key->arp_sip, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
- arp_key->arp_tip, is_mask);
- SW_FLOW_KEY_PUT(match, ip.proto,
- ntohs(arp_key->arp_op), is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
- arp_key->arp_sha, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
- arp_key->arp_tha, ETH_ALEN, is_mask);
-
- attrs &= ~(1 << OVS_KEY_ATTR_ARP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
- const struct ovs_key_tcp *tcp_key;
-
- tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- tcp_key->tcp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- tcp_key->tcp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- tcp_key->tcp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- tcp_key->tcp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_TCP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
- const struct ovs_key_udp *udp_key;
-
- udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- udp_key->udp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- udp_key->udp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- udp_key->udp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- udp_key->udp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_UDP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
- const struct ovs_key_sctp *sctp_key;
-
- sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- sctp_key->sctp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- sctp_key->sctp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- sctp_key->sctp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- sctp_key->sctp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
- const struct ovs_key_icmp *icmp_key;
-
- icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- htons(icmp_key->icmp_type), is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- htons(icmp_key->icmp_code), is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
- const struct ovs_key_icmpv6 *icmpv6_key;
-
- icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- htons(icmpv6_key->icmpv6_type), is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- htons(icmpv6_key->icmpv6_code), is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ND)) {
- const struct ovs_key_nd *nd_key;
-
- nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
- nd_key->nd_target,
- sizeof(match->key->ipv6.nd.target),
- is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
- nd_key->nd_sll, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
- nd_key->nd_tll, ETH_ALEN, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ND);
- }
-
- if (attrs != 0)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
- * mask. In case the 'mask' is NULL, the flow is treated as exact match
- * flow. Otherwise, it is treated as a wildcarded flow, except the mask
- * does not include any don't care bit.
- * @match: receives the extracted flow match information.
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence. The fields should of the packet that triggered the creation
- * of this flow.
- * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
- * attribute specifies the mask field of the wildcarded flow.
- */
-int ovs_match_from_nlattrs(struct sw_flow_match *match,
- const struct nlattr *key,
- const struct nlattr *mask)
-{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- const struct nlattr *encap;
- u64 key_attrs = 0;
- u64 mask_attrs = 0;
- bool encap_valid = false;
- int err;
-
- err = parse_flow_nlattrs(key, a, &key_attrs);
- if (err)
- return err;
-
- if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
- (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
- __be16 tci;
-
- if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
- OVS_NLERR("Invalid Vlan frame.\n");
- return -EINVAL;
- }
-
- key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- encap = a[OVS_KEY_ATTR_ENCAP];
- key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- encap_valid = true;
-
- if (tci & htons(VLAN_TAG_PRESENT)) {
- err = parse_flow_nlattrs(encap, a, &key_attrs);
- if (err)
- return err;
- } else if (!tci) {
- /* Corner case for truncated 802.1Q header. */
- if (nla_len(encap)) {
- OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
- return -EINVAL;
- }
- } else {
- OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
- return -EINVAL;
- }
- }
-
- err = ovs_key_from_nlattrs(match, key_attrs, a, false);
- if (err)
- return err;
-
- if (mask) {
- err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
- if (err)
- return err;
-
- if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) {
- __be16 eth_type = 0;
- __be16 tci = 0;
-
- if (!encap_valid) {
- OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
- return -EINVAL;
- }
-
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- if (a[OVS_KEY_ATTR_ETHERTYPE])
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-
- if (eth_type == htons(0xffff)) {
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- encap = a[OVS_KEY_ATTR_ENCAP];
- err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
- } else {
- OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
- ntohs(eth_type));
- return -EINVAL;
- }
-
- if (a[OVS_KEY_ATTR_VLAN])
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
-
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
- return -EINVAL;
- }
- }
-
- err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
- if (err)
- return err;
- } else {
- /* Populate exact match flow's key mask. */
- if (match->mask)
- ovs_sw_flow_mask_set(match->mask, &match->range, 0xff);
- }
-
- if (!ovs_match_validate(match, key_attrs, mask_attrs))
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
- * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- *
- * This parses a series of Netlink attributes that form a flow key, which must
- * take the same form accepted by flow_from_nlattrs(), but only enough of it to
- * get the metadata, that is, the parts of the flow key that cannot be
- * extracted from the packet itself.
- */
-
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
- const struct nlattr *attr)
-{
- struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- u64 attrs = 0;
- int err;
- struct sw_flow_match match;
-
- flow->key.phy.in_port = DP_MAX_PORTS;
- flow->key.phy.priority = 0;
- flow->key.phy.skb_mark = 0;
- memset(tun_key, 0, sizeof(flow->key.tun_key));
-
- err = parse_flow_nlattrs(attr, a, &attrs);
- if (err)
- return -EINVAL;
-
- memset(&match, 0, sizeof(match));
- match.key = &flow->key;
-
- err = metadata_from_nlattrs(&match, &attrs, a, false);
- if (err)
- return err;
-
- return 0;
-}
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
- const struct sw_flow_key *output, struct sk_buff *skb)
-{
- struct ovs_key_ethernet *eth_key;
- struct nlattr *nla, *encap;
- bool is_mask = (swkey != output);
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
- goto nla_put_failure;
-
- if ((swkey->tun_key.ipv4_dst || is_mask) &&
- ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
- goto nla_put_failure;
-
- if (swkey->phy.in_port == DP_MAX_PORTS) {
- if (is_mask && (output->phy.in_port == 0xffff))
- if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
- goto nla_put_failure;
- } else {
- u16 upper_u16;
- upper_u16 = !is_mask ? 0 : 0xffff;
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
- (upper_u16 << 16) | output->phy.in_port))
- goto nla_put_failure;
- }
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
- goto nla_put_failure;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
-
- eth_key = nla_data(nla);
- memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
- memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
-
- if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
- __be16 eth_type;
- eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
- nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
- goto nla_put_failure;
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.tci)
- goto unencap;
- } else
- encap = NULL;
-
- if (swkey->eth.type == htons(ETH_P_802_2)) {
- /*
- * Ethertype 802.2 is represented in the netlink with omitted
- * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
- * 0xffff in the mask attribute. Ethertype can also
- * be wildcarded.
- */
- if (is_mask && output->eth.type)
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
- output->eth.type))
- goto nla_put_failure;
- goto unencap;
- }
-
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
- goto nla_put_failure;
-
- if (swkey->eth.type == htons(ETH_P_IP)) {
- struct ovs_key_ipv4 *ipv4_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
- if (!nla)
- goto nla_put_failure;
- ipv4_key = nla_data(nla);
- ipv4_key->ipv4_src = output->ipv4.addr.src;
- ipv4_key->ipv4_dst = output->ipv4.addr.dst;
- ipv4_key->ipv4_proto = output->ip.proto;
- ipv4_key->ipv4_tos = output->ip.tos;
- ipv4_key->ipv4_ttl = output->ip.ttl;
- ipv4_key->ipv4_frag = output->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- struct ovs_key_ipv6 *ipv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
- if (!nla)
- goto nla_put_failure;
- ipv6_key = nla_data(nla);
- memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
- sizeof(ipv6_key->ipv6_src));
- memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
- sizeof(ipv6_key->ipv6_dst));
- ipv6_key->ipv6_label = output->ipv6.label;
- ipv6_key->ipv6_proto = output->ip.proto;
- ipv6_key->ipv6_tclass = output->ip.tos;
- ipv6_key->ipv6_hlimit = output->ip.ttl;
- ipv6_key->ipv6_frag = output->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_ARP) ||
- swkey->eth.type == htons(ETH_P_RARP)) {
- struct ovs_key_arp *arp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
- if (!nla)
- goto nla_put_failure;
- arp_key = nla_data(nla);
- memset(arp_key, 0, sizeof(struct ovs_key_arp));
- arp_key->arp_sip = output->ipv4.addr.src;
- arp_key->arp_tip = output->ipv4.addr.dst;
- arp_key->arp_op = htons(output->ip.proto);
- memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
- memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
- }
-
- if ((swkey->eth.type == htons(ETH_P_IP) ||
- swkey->eth.type == htons(ETH_P_IPV6)) &&
- swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-
- if (swkey->ip.proto == IPPROTO_TCP) {
- struct ovs_key_tcp *tcp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
- if (!nla)
- goto nla_put_failure;
- tcp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- tcp_key->tcp_src = output->ipv4.tp.src;
- tcp_key->tcp_dst = output->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- tcp_key->tcp_src = output->ipv6.tp.src;
- tcp_key->tcp_dst = output->ipv6.tp.dst;
- }
- } else if (swkey->ip.proto == IPPROTO_UDP) {
- struct ovs_key_udp *udp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
- if (!nla)
- goto nla_put_failure;
- udp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- udp_key->udp_src = output->ipv4.tp.src;
- udp_key->udp_dst = output->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- udp_key->udp_src = output->ipv6.tp.src;
- udp_key->udp_dst = output->ipv6.tp.dst;
- }
- } else if (swkey->ip.proto == IPPROTO_SCTP) {
- struct ovs_key_sctp *sctp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
- if (!nla)
- goto nla_put_failure;
- sctp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- sctp_key->sctp_src = swkey->ipv4.tp.src;
- sctp_key->sctp_dst = swkey->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- sctp_key->sctp_src = swkey->ipv6.tp.src;
- sctp_key->sctp_dst = swkey->ipv6.tp.dst;
- }
- } else if (swkey->eth.type == htons(ETH_P_IP) &&
- swkey->ip.proto == IPPROTO_ICMP) {
- struct ovs_key_icmp *icmp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
- if (!nla)
- goto nla_put_failure;
- icmp_key = nla_data(nla);
- icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
- icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
- } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
- swkey->ip.proto == IPPROTO_ICMPV6) {
- struct ovs_key_icmpv6 *icmpv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
- sizeof(*icmpv6_key));
- if (!nla)
- goto nla_put_failure;
- icmpv6_key = nla_data(nla);
- icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
- icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
-
- if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
- icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
- struct ovs_key_nd *nd_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
- if (!nla)
- goto nla_put_failure;
- nd_key = nla_data(nla);
- memcpy(nd_key->nd_target, &output->ipv6.nd.target,
- sizeof(nd_key->nd_target));
- memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
- memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
- }
- }
- }
-
-unencap:
- if (encap)
- nla_nest_end(skb, encap);
-
- return 0;
-
-nla_put_failure:
- return -EMSGSIZE;
-}
-
-/* Initializes the flow module.
- * Returns zero if successful or a negative error code. */
-int ovs_flow_init(void)
-{
- BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
- BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
-
- flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
- 0, NULL);
- if (flow_cache == NULL)
- return -ENOMEM;
-
- return 0;
-}
-
-/* Uninitializes the flow module. */
-void ovs_flow_exit(void)
-{
- kmem_cache_destroy(flow_cache);
-}
-
-struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
-{
- struct sw_flow_mask *mask;
-
- mask = kmalloc(sizeof(*mask), GFP_KERNEL);
- if (mask)
- mask->ref_count = 0;
-
- return mask;
-}
-
-void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask)
-{
- mask->ref_count++;
-}
-
-void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
-{
- if (!mask)
- return;
-
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- if (deferred)
- kfree_rcu(mask, rcu);
- else
- kfree(mask);
- }
-}
-
-static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
- const struct sw_flow_mask *b)
-{
- u8 *a_ = (u8 *)&a->key + a->range.start;
- u8 *b_ = (u8 *)&b->key + b->range.start;
-
- return (a->range.end == b->range.end)
- && (a->range.start == b->range.start)
- && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
-}
-
-struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
- const struct sw_flow_mask *mask)
-{
- struct list_head *ml;
-
- list_for_each(ml, tbl->mask_list) {
- struct sw_flow_mask *m;
- m = container_of(ml, struct sw_flow_mask, list);
- if (ovs_sw_flow_mask_equal(mask, m))
- return m;
- }
-
- return NULL;
-}
-
-/**
- * add a new mask into the mask list.
- * The caller needs to make sure that 'mask' is not the same
- * as any masks that are already on the list.
- */
-void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask)
-{
- list_add_rcu(&mask->list, tbl->mask_list);
-}
-
-/**
- * Set 'range' fields in the mask to the value of 'val'.
- */
-static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
- struct sw_flow_key_range *range, u8 val)
-{
- u8 *m = (u8 *)&mask->key + range->start;
-
- mask->range = *range;
- memset(m, val, range_n_bytes(range));
-}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 212fbf7510c4..1510f51dbf74 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -33,14 +33,6 @@
#include <net/inet_ecn.h>
struct sk_buff;
-struct sw_flow_mask;
-struct flow_table;
-
-struct sw_flow_actions {
- struct rcu_head rcu;
- u32 actions_len;
- struct nlattr actions[];
-};
/* Used to memset ovs_key_ipv4_tunnel padding. */
#define OVS_TUNNEL_KEY_SIZE \
@@ -101,6 +93,7 @@ struct sw_flow_key {
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
} tp;
struct {
u8 sha[ETH_ALEN]; /* ARP source hardware address. */
@@ -117,6 +110,7 @@ struct sw_flow_key {
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
} tp;
struct {
struct in6_addr target; /* ND target address. */
@@ -127,6 +121,31 @@ struct sw_flow_key {
};
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+struct sw_flow_key_range {
+ size_t start;
+ size_t end;
+};
+
+struct sw_flow_mask {
+ int ref_count;
+ struct rcu_head rcu;
+ struct list_head list;
+ struct sw_flow_key_range range;
+ struct sw_flow_key key;
+};
+
+struct sw_flow_match {
+ struct sw_flow_key *key;
+ struct sw_flow_key_range range;
+ struct sw_flow_mask *mask;
+};
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ u32 actions_len;
+ struct nlattr actions[];
+};
+
struct sw_flow {
struct rcu_head rcu;
struct hlist_node hash_node[2];
@@ -141,23 +160,9 @@ struct sw_flow {
unsigned long used; /* Last used time (in jiffies). */
u64 packet_count; /* Number of packets matched. */
u64 byte_count; /* Number of bytes matched. */
- u8 tcp_flags; /* Union of seen TCP flags. */
-};
-
-struct sw_flow_key_range {
- size_t start;
- size_t end;
+ __be16 tcp_flags; /* Union of seen TCP flags. */
};
-struct sw_flow_match {
- struct sw_flow_key *key;
- struct sw_flow_key_range range;
- struct sw_flow_mask *mask;
-};
-
-void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key, struct sw_flow_mask *mask);
-
struct arp_eth_header {
__be16 ar_hrd; /* format of hardware address */
__be16 ar_pro; /* format of protocol address */
@@ -172,88 +177,9 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
-int ovs_flow_init(void);
-void ovs_flow_exit(void);
-
-struct sw_flow *ovs_flow_alloc(void);
-void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *, bool deferred);
-
-struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
-
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
void ovs_flow_used(struct sw_flow *, struct sk_buff *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
-int ovs_flow_to_nlattrs(const struct sw_flow_key *,
- const struct sw_flow_key *, struct sk_buff *);
-int ovs_match_from_nlattrs(struct sw_flow_match *match,
- const struct nlattr *,
- const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
- const struct nlattr *attr);
-#define MAX_ACTIONS_BUFSIZE (32 * 1024)
-#define TBL_MIN_BUCKETS 1024
-
-struct flow_table {
- struct flex_array *buckets;
- unsigned int count, n_buckets;
- struct rcu_head rcu;
- struct list_head *mask_list;
- int node_ver;
- u32 hash_seed;
- bool keep_flows;
-};
-
-static inline int ovs_flow_tbl_count(struct flow_table *table)
-{
- return table->count;
-}
-
-static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
-{
- return (table->count > table->n_buckets);
-}
-
-struct sw_flow *ovs_flow_lookup(struct flow_table *,
- const struct sw_flow_key *);
-struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
- struct sw_flow_match *match);
-
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
-struct flow_table *ovs_flow_tbl_alloc(int new_size);
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-
-void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow);
-void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow);
-
-struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx);
-extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
-int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
- struct sw_flow_match *match, bool is_mask);
-int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *tun_key,
- const struct ovs_key_ipv4_tunnel *output);
-
-bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_end);
-
-struct sw_flow_mask {
- int ref_count;
- struct rcu_head rcu;
- struct list_head list;
- struct sw_flow_key_range range;
- struct sw_flow_key key;
-};
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
-struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
-void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
-void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
-void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *);
-struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *,
- const struct sw_flow_mask *);
-void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
- const struct sw_flow_mask *mask);
#endif /* flow.h */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
new file mode 100644
index 000000000000..2bc1bc1aca3b
--- /dev/null
+++ b/net/openvswitch/flow_netlink.c
@@ -0,0 +1,1630 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#include "flow_netlink.h"
+
+static void update_range__(struct sw_flow_match *match,
+ size_t offset, size_t size, bool is_mask)
+{
+ struct sw_flow_key_range *range = NULL;
+ size_t start = rounddown(offset, sizeof(long));
+ size_t end = roundup(offset + size, sizeof(long));
+
+ if (!is_mask)
+ range = &match->range;
+ else if (match->mask)
+ range = &match->mask->range;
+
+ if (!range)
+ return;
+
+ if (range->start == range->end) {
+ range->start = start;
+ range->end = end;
+ return;
+ }
+
+ if (range->start > start)
+ range->start = start;
+
+ if (range->end < end)
+ range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ (match)->mask->key.field = value; \
+ } else { \
+ (match)->key->field = value; \
+ } \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ len, is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ memcpy(&(match)->mask->key.field, value_p, len);\
+ } else { \
+ memcpy(&(match)->key->field, value_p, len); \
+ } \
+ } while (0)
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+static bool match_validate(const struct sw_flow_match *match,
+ u64 key_attrs, u64 mask_attrs)
+{
+ u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 mask_allowed = key_attrs; /* At most allow all key attributes */
+
+ /* The following mask attributes allowed only if they
+ * pass the validation tests. */
+ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_TCP)
+ | (1 << OVS_KEY_ATTR_TCP_FLAGS)
+ | (1 << OVS_KEY_ATTR_UDP)
+ | (1 << OVS_KEY_ATTR_SCTP)
+ | (1 << OVS_KEY_ATTR_ICMP)
+ | (1 << OVS_KEY_ATTR_ICMPV6)
+ | (1 << OVS_KEY_ATTR_ARP)
+ | (1 << OVS_KEY_ATTR_ND));
+
+ /* Always allowed mask fields. */
+ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+ | (1 << OVS_KEY_ATTR_IN_PORT)
+ | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+ /* Check key attributes. */
+ if (match->key->eth.type == htons(ETH_P_ARP)
+ || match->key->eth.type == htons(ETH_P_RARP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ARP;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMP) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+ }
+ }
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IPV6)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMPV6) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+ if (match->key->ipv6.tp.src ==
+ htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ND;
+ if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+ }
+ }
+ }
+ }
+
+ if ((key_attrs & key_expected) != key_expected) {
+ /* Key attributes check failed. */
+ OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+ key_attrs, key_expected);
+ return false;
+ }
+
+ if ((mask_attrs & mask_allowed) != mask_attrs) {
+ /* Mask attributes check failed. */
+ OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+ mask_attrs, mask_allowed);
+ return false;
+ }
+
+ return true;
+}
+
+/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
+static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = -1,
+ [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
+ [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+ [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
+ [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
+ [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
+ [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
+ [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
+ [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
+ [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
+ [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
+ [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+ [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
+ [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
+ [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
+ [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
+ [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+ [OVS_KEY_ATTR_TUNNEL] = -1,
+};
+
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+ int i;
+
+ if (!fp)
+ return false;
+
+ for (i = 0; i < size; i++)
+ if (fp[i])
+ return false;
+
+ return true;
+}
+
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[],
+ u64 *attrsp, bool nz)
+{
+ const struct nlattr *nla;
+ u64 attrs;
+ int rem;
+
+ attrs = *attrsp;
+ nla_for_each_nested(nla, attr, rem) {
+ u16 type = nla_type(nla);
+ int expected_len;
+
+ if (type > OVS_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+ type, OVS_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (attrs & (1 << type)) {
+ OVS_NLERR("Duplicate key attribute (type %d).\n", type);
+ return -EINVAL;
+ }
+
+ expected_len = ovs_key_lens[type];
+ if (nla_len(nla) != expected_len && expected_len != -1) {
+ OVS_NLERR("Key attribute has unexpected length (type=%d"
+ ", length=%d, expected=%d).\n", type,
+ nla_len(nla), expected_len);
+ return -EINVAL;
+ }
+
+ if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+ attrs |= 1 << type;
+ a[type] = nla;
+ }
+ }
+ if (rem) {
+ OVS_NLERR("Message has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ *attrsp = attrs;
+ return 0;
+}
+
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
+
+static int ipv4_tun_from_nlattr(const struct nlattr *attr,
+ struct sw_flow_match *match, bool is_mask)
+{
+ struct nlattr *a;
+ int rem;
+ bool ttl = false;
+ __be16 tun_flags = 0;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+ [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
+ [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
+ [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
+ [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+ };
+
+ if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+ type, OVS_TUNNEL_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+ OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+ " length (type=%d, length=%d, expected=%d).\n",
+ type, nla_len(a), ovs_tunnel_key_lens[type]);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_TUNNEL_KEY_ATTR_ID:
+ SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+ nla_get_be64(a), is_mask);
+ tun_flags |= TUNNEL_KEY;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TOS:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ nla_get_u8(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TTL:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ nla_get_u8(a), is_mask);
+ ttl = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+ tun_flags |= TUNNEL_DONT_FRAGMENT;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_CSUM:
+ tun_flags |= TUNNEL_CSUM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+
+ if (rem > 0) {
+ OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ if (!is_mask) {
+ if (!match->key->tun_key.ipv4_dst) {
+ OVS_NLERR("IPv4 tunnel destination address is zero.\n");
+ return -EINVAL;
+ }
+
+ if (!ttl) {
+ OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ipv4_tun_to_nlattr(struct sk_buff *skb,
+ const struct ovs_key_ipv4_tunnel *tun_key,
+ const struct ovs_key_ipv4_tunnel *output)
+{
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (output->tun_flags & TUNNEL_KEY &&
+ nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+ return -EMSGSIZE;
+ if (output->ipv4_src &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+ return -EMSGSIZE;
+ if (output->ipv4_dst &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+ return -EMSGSIZE;
+ if (output->ipv4_tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_CSUM) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+
+static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+ SW_FLOW_KEY_PUT(match, phy.priority,
+ nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+ u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
+
+ if (is_mask)
+ in_port = 0xffffffff; /* Always exact match in_port. */
+ else if (in_port >= DP_MAX_PORTS)
+ return -EINVAL;
+
+ SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+ uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+ SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+ if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+ is_mask))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
+ }
+ return 0;
+}
+
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ int err;
+ u64 orig_attrs = attrs;
+
+ err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+ if (err)
+ return err;
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+ const struct ovs_key_ethernet *eth_key;
+
+ eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+ SW_FLOW_KEY_MEMCPY(match, eth.src,
+ eth_key->eth_src, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, eth.dst,
+ eth_key->eth_dst, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ __be16 tci;
+
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (is_mask)
+ OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+ else
+ OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
+
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ } else if (!is_mask)
+ SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
+ if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+ ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv4_key->ipv4_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv4_key->ipv4_tos, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv4_key->ipv4_ttl, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv4_key->ipv4_frag, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ ipv4_key->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ ipv4_key->ipv4_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+ const struct ovs_key_ipv6 *ipv6_key;
+
+ ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+ if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+ ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ipv6.label,
+ ipv6_key->ipv6_label, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv6_key->ipv6_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv6_key->ipv6_tclass, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv6_key->ipv6_hlimit, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv6_key->ipv6_frag, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+ ipv6_key->ipv6_src,
+ sizeof(match->key->ipv6.addr.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+ ipv6_key->ipv6_dst,
+ sizeof(match->key->ipv6.addr.dst),
+ is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
+ const struct ovs_key_arp *arp_key;
+
+ arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+ if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+ OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+ arp_key->arp_op);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ arp_key->arp_sip, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ arp_key->arp_tip, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ntohs(arp_key->arp_op), is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+ arp_key->arp_sha, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+ arp_key->arp_tha, ETH_ALEN, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+ const struct ovs_key_tcp *tcp_key;
+
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ tcp_key->tcp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ tcp_key->tcp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+ const struct ovs_key_udp *udp_key;
+
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ udp_key->udp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ udp_key->udp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+ const struct ovs_key_sctp *sctp_key;
+
+ sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ sctp_key->sctp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ sctp_key->sctp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+ const struct ovs_key_icmp *icmp_key;
+
+ icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ htons(icmp_key->icmp_type), is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ htons(icmp_key->icmp_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+ const struct ovs_key_icmpv6 *icmpv6_key;
+
+ icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ htons(icmpv6_key->icmpv6_type), is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ htons(icmpv6_key->icmpv6_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+ const struct ovs_key_nd *nd_key;
+
+ nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+ nd_key->nd_target,
+ sizeof(match->key->ipv6.nd.target),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+ nd_key->nd_sll, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+ nd_key->nd_tll, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ND);
+ }
+
+ if (attrs != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sw_flow_mask_set(struct sw_flow_mask *mask,
+ struct sw_flow_key_range *range, u8 val)
+{
+ u8 *m = (u8 *)&mask->key + range->start;
+
+ mask->range = *range;
+ memset(m, val, range_n_bytes(range));
+}
+
+/**
+ * ovs_nla_get_match - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ */
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *key,
+ const struct nlattr *mask)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ const struct nlattr *encap;
+ u64 key_attrs = 0;
+ u64 mask_attrs = 0;
+ bool encap_valid = false;
+ int err;
+
+ err = parse_flow_nlattrs(key, a, &key_attrs);
+ if (err)
+ return err;
+
+ if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+ __be16 tci;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR("Invalid Vlan frame.\n");
+ return -EINVAL;
+ }
+
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ encap_valid = true;
+
+ if (tci & htons(VLAN_TAG_PRESENT)) {
+ err = parse_flow_nlattrs(encap, a, &key_attrs);
+ if (err)
+ return err;
+ } else if (!tci) {
+ /* Corner case for truncated 802.1Q header. */
+ if (nla_len(encap)) {
+ OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
+ return -EINVAL;
+ }
+ } else {
+ OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+ if (err)
+ return err;
+
+ if (mask) {
+ err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+ if (err)
+ return err;
+
+ if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
+ __be16 eth_type = 0;
+ __be16 tci = 0;
+
+ if (!encap_valid) {
+ OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
+ return -EINVAL;
+ }
+
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (eth_type == htons(0xffff)) {
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+ } else {
+ OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
+ ntohs(eth_type));
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+ if (err)
+ return err;
+ } else {
+ /* Populate exact match flow's key mask. */
+ if (match->mask)
+ sw_flow_mask_set(match->mask, &match->range, 0xff);
+ }
+
+ if (!match_validate(match, key_attrs, mask_attrs))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
+ * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ *
+ * This parses a series of Netlink attributes that form a flow key, which must
+ * take the same form accepted by flow_from_nlattrs(), but only enough of it to
+ * get the metadata, that is, the parts of the flow key that cannot be
+ * extracted from the packet itself.
+ */
+
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr)
+{
+ struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
+ int err;
+ struct sw_flow_match match;
+
+ flow->key.phy.in_port = DP_MAX_PORTS;
+ flow->key.phy.priority = 0;
+ flow->key.phy.skb_mark = 0;
+ memset(tun_key, 0, sizeof(flow->key.tun_key));
+
+ err = parse_flow_nlattrs(attr, a, &attrs);
+ if (err)
+ return -EINVAL;
+
+ memset(&match, 0, sizeof(match));
+ match.key = &flow->key;
+
+ err = metadata_from_nlattrs(&match, &attrs, a, false);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int ovs_nla_put_flow(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
+{
+ struct ovs_key_ethernet *eth_key;
+ struct nlattr *nla, *encap;
+ bool is_mask = (swkey != output);
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
+ goto nla_put_failure;
+
+ if ((swkey->tun_key.ipv4_dst || is_mask) &&
+ ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
+ goto nla_put_failure;
+
+ if (swkey->phy.in_port == DP_MAX_PORTS) {
+ if (is_mask && (output->phy.in_port == 0xffff))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+ goto nla_put_failure;
+ } else {
+ u16 upper_u16;
+ upper_u16 = !is_mask ? 0 : 0xffff;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+ (upper_u16 << 16) | output->phy.in_port))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
+
+ eth_key = nla_data(nla);
+ memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
+ memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
+
+ if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
+ __be16 eth_type;
+ eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+ goto nla_put_failure;
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.tci)
+ goto unencap;
+ } else
+ encap = NULL;
+
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
+ goto nla_put_failure;
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ipv4 *ipv4_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv4_key = nla_data(nla);
+ ipv4_key->ipv4_src = output->ipv4.addr.src;
+ ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+ ipv4_key->ipv4_proto = output->ip.proto;
+ ipv4_key->ipv4_tos = output->ip.tos;
+ ipv4_key->ipv4_ttl = output->ip.ttl;
+ ipv4_key->ipv4_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ipv6 *ipv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_key = nla_data(nla);
+ memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
+ sizeof(ipv6_key->ipv6_src));
+ memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
+ sizeof(ipv6_key->ipv6_dst));
+ ipv6_key->ipv6_label = output->ipv6.label;
+ ipv6_key->ipv6_proto = output->ip.proto;
+ ipv6_key->ipv6_tclass = output->ip.tos;
+ ipv6_key->ipv6_hlimit = output->ip.ttl;
+ ipv6_key->ipv6_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+ swkey->eth.type == htons(ETH_P_RARP)) {
+ struct ovs_key_arp *arp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
+ if (!nla)
+ goto nla_put_failure;
+ arp_key = nla_data(nla);
+ memset(arp_key, 0, sizeof(struct ovs_key_arp));
+ arp_key->arp_sip = output->ipv4.addr.src;
+ arp_key->arp_tip = output->ipv4.addr.dst;
+ arp_key->arp_op = htons(output->ip.proto);
+ memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
+ memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
+ }
+
+ if ((swkey->eth.type == htons(ETH_P_IP) ||
+ swkey->eth.type == htons(ETH_P_IPV6)) &&
+ swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+
+ if (swkey->ip.proto == IPPROTO_TCP) {
+ struct ovs_key_tcp *tcp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
+ if (!nla)
+ goto nla_put_failure;
+ tcp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ tcp_key->tcp_src = output->ipv4.tp.src;
+ tcp_key->tcp_dst = output->ipv4.tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->ipv4.tp.flags))
+ goto nla_put_failure;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ tcp_key->tcp_src = output->ipv6.tp.src;
+ tcp_key->tcp_dst = output->ipv6.tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->ipv6.tp.flags))
+ goto nla_put_failure;
+ }
+ } else if (swkey->ip.proto == IPPROTO_UDP) {
+ struct ovs_key_udp *udp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
+ if (!nla)
+ goto nla_put_failure;
+ udp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ udp_key->udp_src = output->ipv4.tp.src;
+ udp_key->udp_dst = output->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ udp_key->udp_src = output->ipv6.tp.src;
+ udp_key->udp_dst = output->ipv6.tp.dst;
+ }
+ } else if (swkey->ip.proto == IPPROTO_SCTP) {
+ struct ovs_key_sctp *sctp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+ if (!nla)
+ goto nla_put_failure;
+ sctp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ sctp_key->sctp_src = swkey->ipv4.tp.src;
+ sctp_key->sctp_dst = swkey->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ sctp_key->sctp_src = swkey->ipv6.tp.src;
+ sctp_key->sctp_dst = swkey->ipv6.tp.dst;
+ }
+ } else if (swkey->eth.type == htons(ETH_P_IP) &&
+ swkey->ip.proto == IPPROTO_ICMP) {
+ struct ovs_key_icmp *icmp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmp_key = nla_data(nla);
+ icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
+ icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
+ } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
+ swkey->ip.proto == IPPROTO_ICMPV6) {
+ struct ovs_key_icmpv6 *icmpv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
+ sizeof(*icmpv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmpv6_key = nla_data(nla);
+ icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
+ icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
+
+ if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ struct ovs_key_nd *nd_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
+ if (!nla)
+ goto nla_put_failure;
+ nd_key = nla_data(nla);
+ memcpy(nd_key->nd_target, &output->ipv6.nd.target,
+ sizeof(nd_key->nd_target));
+ memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
+ memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
+ }
+ }
+ }
+
+unencap:
+ if (encap)
+ nla_nest_end(skb, encap);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+#define MAX_ACTIONS_BUFSIZE (32 * 1024)
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
+{
+ struct sw_flow_actions *sfa;
+
+ if (size > MAX_ACTIONS_BUFSIZE)
+ return ERR_PTR(-EINVAL);
+
+ sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->actions_len = 0;
+ return sfa;
+}
+
+/* RCU callback used by ovs_nla_free_flow_actions. */
+static void rcu_free_acts_callback(struct rcu_head *rcu)
+{
+ struct sw_flow_actions *sf_acts = container_of(rcu,
+ struct sw_flow_actions, rcu);
+ kfree(sf_acts);
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
+}
+
+static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
+ int attr_len)
+{
+
+ struct sw_flow_actions *acts;
+ int new_acts_size;
+ int req_size = NLA_ALIGN(attr_len);
+ int next_offset = offsetof(struct sw_flow_actions, actions) +
+ (*sfa)->actions_len;
+
+ if (req_size <= (ksize(*sfa) - next_offset))
+ goto out;
+
+ new_acts_size = ksize(*sfa) * 2;
+
+ if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
+ if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
+ return ERR_PTR(-EMSGSIZE);
+ new_acts_size = MAX_ACTIONS_BUFSIZE;
+ }
+
+ acts = ovs_nla_alloc_flow_actions(new_acts_size);
+ if (IS_ERR(acts))
+ return (void *)acts;
+
+ memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
+ acts->actions_len = (*sfa)->actions_len;
+ kfree(*sfa);
+ *sfa = acts;
+
+out:
+ (*sfa)->actions_len += req_size;
+ return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
+{
+ struct nlattr *a;
+
+ a = reserve_sfa_size(sfa, nla_attr_size(len));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ a->nla_type = attrtype;
+ a->nla_len = nla_attr_size(len);
+
+ if (data)
+ memcpy(nla_data(a), data, len);
+ memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
+
+ return 0;
+}
+
+static inline int add_nested_action_start(struct sw_flow_actions **sfa,
+ int attrtype)
+{
+ int used = (*sfa)->actions_len;
+ int err;
+
+ err = add_action(sfa, attrtype, NULL, 0);
+ if (err)
+ return err;
+
+ return used;
+}
+
+static inline void add_nested_action_end(struct sw_flow_actions *sfa,
+ int st_offset)
+{
+ struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
+ st_offset);
+
+ a->nla_len = sfa->actions_len - st_offset;
+}
+
+static int validate_and_copy_sample(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+ const struct nlattr *probability, *actions;
+ const struct nlattr *a;
+ int rem, start, err, st_acts;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+ if (!probability || nla_len(probability) != sizeof(u32))
+ return -EINVAL;
+
+ actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+
+ /* validation done, copy sample action. */
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
+ if (start < 0)
+ return start;
+ err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
+ nla_data(probability), sizeof(u32));
+ if (err)
+ return err;
+ st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
+ if (st_acts < 0)
+ return st_acts;
+
+ err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, st_acts);
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
+static int validate_tp_port(const struct sw_flow_key *flow_key)
+{
+ if (flow_key->eth.type == htons(ETH_P_IP)) {
+ if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
+ return 0;
+ } else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
+ if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct sw_flow_mask *mask)
+{
+ memset(match, 0, sizeof(*match));
+ match->key = key;
+ match->mask = mask;
+
+ memset(key, 0, sizeof(*key));
+
+ if (mask) {
+ memset(&mask->key, 0, sizeof(mask->key));
+ mask->range.start = mask->range.end = 0;
+ }
+}
+
+static int validate_and_copy_set_tun(const struct nlattr *attr,
+ struct sw_flow_actions **sfa)
+{
+ struct sw_flow_match match;
+ struct sw_flow_key key;
+ int err, start;
+
+ ovs_match_init(&match, &key, NULL);
+ err = ipv4_tun_from_nlattr(nla_data(attr), &match, false);
+ if (err)
+ return err;
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
+ if (start < 0)
+ return start;
+
+ err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+ sizeof(match.key->tun_key));
+ add_nested_action_end(*sfa, start);
+
+ return err;
+}
+
+static int validate_set(const struct nlattr *a,
+ const struct sw_flow_key *flow_key,
+ struct sw_flow_actions **sfa,
+ bool *set_tun)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+
+ /* There can be only one key in a action */
+ if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ return -EINVAL;
+
+ if (key_type > OVS_KEY_ATTR_MAX ||
+ (ovs_key_lens[key_type] != nla_len(ovs_key) &&
+ ovs_key_lens[key_type] != -1))
+ return -EINVAL;
+
+ switch (key_type) {
+ const struct ovs_key_ipv4 *ipv4_key;
+ const struct ovs_key_ipv6 *ipv6_key;
+ int err;
+
+ case OVS_KEY_ATTR_PRIORITY:
+ case OVS_KEY_ATTR_SKB_MARK:
+ case OVS_KEY_ATTR_ETHERNET:
+ break;
+
+ case OVS_KEY_ATTR_TUNNEL:
+ *set_tun = true;
+ err = validate_and_copy_set_tun(a, sfa);
+ if (err)
+ return err;
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ if (flow_key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv4_key = nla_data(ovs_key);
+ if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_IPV6:
+ if (flow_key->eth.type != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv6_key = nla_data(ovs_key);
+ if (ipv6_key->ipv6_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv6_key->ipv6_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ if (flow_key->ip.proto != IPPROTO_TCP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_UDP:
+ if (flow_key->ip.proto != IPPROTO_UDP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_SCTP:
+ if (flow_key->ip.proto != IPPROTO_SCTP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+ static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+ [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
+ };
+ struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+ int error;
+
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+ attr, userspace_policy);
+ if (error)
+ return error;
+
+ if (!a[OVS_USERSPACE_ATTR_PID] ||
+ !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int copy_action(const struct nlattr *from,
+ struct sw_flow_actions **sfa)
+{
+ int totlen = NLA_ALIGN(from->nla_len);
+ struct nlattr *to;
+
+ to = reserve_sfa_size(sfa, from->nla_len);
+ if (IS_ERR(to))
+ return PTR_ERR(to);
+
+ memcpy(to, from, totlen);
+ return 0;
+}
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ if (depth >= SAMPLE_ACTION_DEPTH)
+ return -EOVERFLOW;
+
+ nla_for_each_nested(a, attr, rem) {
+ /* Expected argument lengths, (u32)-1 for variable length. */
+ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+ [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+ [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+ [OVS_ACTION_ATTR_POP_VLAN] = 0,
+ [OVS_ACTION_ATTR_SET] = (u32)-1,
+ [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+ };
+ const struct ovs_action_push_vlan *vlan;
+ int type = nla_type(a);
+ bool skip_copy;
+
+ if (type > OVS_ACTION_ATTR_MAX ||
+ (action_lens[type] != nla_len(a) &&
+ action_lens[type] != (u32)-1))
+ return -EINVAL;
+
+ skip_copy = false;
+ switch (type) {
+ case OVS_ACTION_ATTR_UNSPEC:
+ return -EINVAL;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ err = validate_userspace(a);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ vlan = nla_data(a);
+ if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ return -EINVAL;
+ if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ return -EINVAL;
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = validate_set(a, key, sfa, &skip_copy);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = validate_and_copy_sample(a, key, depth, sfa);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (!skip_copy) {
+ err = copy_action(a, sfa);
+ if (err)
+ return err;
+ }
+ }
+
+ if (rem > 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ struct nlattr *start;
+ int err = 0, rem;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ struct nlattr *st_sample;
+
+ switch (type) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
+ if (!st_sample)
+ return -EMSGSIZE;
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, st_sample);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return err;
+}
+
+static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+ struct nlattr *start;
+ int err;
+
+ switch (key_type) {
+ case OVS_KEY_ATTR_IPV4_TUNNEL:
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+ nla_data(ovs_key));
+ if (err)
+ return err;
+ nla_nest_end(skb, start);
+ break;
+ default:
+ if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
+ return -EMSGSIZE;
+ break;
+ }
+
+ return 0;
+}
+
+int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ nla_for_each_attr(a, attr, len, rem) {
+ int type = nla_type(a);
+
+ switch (type) {
+ case OVS_ACTION_ATTR_SET:
+ err = set_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+ default:
+ if (nla_put(skb, type, nla_len(a), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
new file mode 100644
index 000000000000..440151045d39
--- /dev/null
+++ b/net/openvswitch/flow_netlink.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+
+#ifndef FLOW_NETLINK_H
+#define FLOW_NETLINK_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key, struct sw_flow_mask *mask);
+
+int ovs_nla_put_flow(const struct sw_flow_key *,
+ const struct sw_flow_key *, struct sk_buff *);
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr);
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *,
+ const struct nlattr *);
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa);
+int ovs_nla_put_actions(const struct nlattr *attr,
+ int len, struct sk_buff *skb);
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len);
+void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+
+#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
new file mode 100644
index 000000000000..e42542706087
--- /dev/null
+++ b/net/openvswitch/flow_table.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#include "datapath.h"
+
+#define TBL_MIN_BUCKETS 1024
+#define REHASH_INTERVAL (10 * 60 * HZ)
+
+static struct kmem_cache *flow_cache;
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask)
+{
+ const long *m = (long *)((u8 *)&mask->key + mask->range.start);
+ const long *s = (long *)((u8 *)src + mask->range.start);
+ long *d = (long *)((u8 *)dst + mask->range.start);
+ int i;
+
+ /* The memory outside of the 'mask->range' are not set since
+ * further operations on 'dst' only uses contents within
+ * 'mask->range'.
+ */
+ for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+ *d++ = *s++ & *m++;
+}
+
+struct sw_flow *ovs_flow_alloc(void)
+{
+ struct sw_flow *flow;
+
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&flow->lock);
+ flow->sf_acts = NULL;
+ flow->mask = NULL;
+
+ return flow;
+}
+
+int ovs_flow_tbl_count(struct flow_table *table)
+{
+ return table->count;
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ n_buckets, GFP_KERNEL);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void flow_free(struct sw_flow *flow)
+{
+ kfree((struct sf_flow_acts __force *)flow->sf_acts);
+ kmem_cache_free(flow_cache, flow);
+}
+
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+
+ flow_free(flow);
+}
+
+static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu)
+{
+ struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu);
+
+ kfree(mask);
+}
+
+static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
+{
+ if (!mask)
+ return;
+
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ if (deferred)
+ call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb);
+ else
+ kfree(mask);
+ }
+}
+
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
+{
+ if (!flow)
+ return;
+
+ flow_mask_del_ref(flow->mask, deferred);
+
+ if (deferred)
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+ else
+ flow_free(flow);
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+static void __table_instance_destroy(struct table_instance *ti)
+{
+ int i;
+
+ if (ti->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < ti->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_node *n;
+ int ver = ti->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ hlist_del(&flow->hash_node[ver]);
+ ovs_flow_free(flow, false);
+ }
+ }
+
+skip_flows:
+ free_buckets(ti->buckets);
+ kfree(ti);
+}
+
+static struct table_instance *table_instance_alloc(int new_size)
+{
+ struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+ if (!ti)
+ return NULL;
+
+ ti->buckets = alloc_buckets(new_size);
+
+ if (!ti->buckets) {
+ kfree(ti);
+ return NULL;
+ }
+ ti->n_buckets = new_size;
+ ti->node_ver = 0;
+ ti->keep_flows = false;
+ get_random_bytes(&ti->hash_seed, sizeof(u32));
+
+ return ti;
+}
+
+int ovs_flow_tbl_init(struct flow_table *table)
+{
+ struct table_instance *ti;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
+
+ if (!ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(table->ti, ti);
+ INIT_LIST_HEAD(&table->mask_list);
+ table->last_rehash = jiffies;
+ table->count = 0;
+ return 0;
+}
+
+static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+
+ __table_instance_destroy(ti);
+}
+
+static void table_instance_destroy(struct table_instance *ti, bool deferred)
+{
+ if (!ti)
+ return;
+
+ if (deferred)
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ else
+ __table_instance_destroy(ti);
+}
+
+void ovs_flow_tbl_destroy(struct flow_table *table)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ table_instance_destroy(ti, false);
+}
+
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
+ u32 *bucket, u32 *last)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int ver;
+ int i;
+
+ ver = ti->node_ver;
+ while (*bucket < ti->n_buckets) {
+ i = 0;
+ head = flex_array_get(ti->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
+{
+ hash = jhash_1word(hash, ti->hash_seed);
+ return flex_array_get(ti->buckets,
+ (hash & (ti->n_buckets - 1)));
+}
+
+static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(ti, flow->hash);
+ hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head);
+}
+
+static void flow_table_copy_flows(struct table_instance *old,
+ struct table_instance *new)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head;
+
+ head = flex_array_get(old->buckets, i);
+
+ hlist_for_each_entry(flow, head, hash_node[old_ver])
+ table_instance_insert(new, flow);
+ }
+
+ old->keep_flows = true;
+}
+
+static struct table_instance *table_instance_rehash(struct table_instance *ti,
+ int n_buckets)
+{
+ struct table_instance *new_ti;
+
+ new_ti = table_instance_alloc(n_buckets);
+ if (!new_ti)
+ return NULL;
+
+ flow_table_copy_flows(ti, new_ti);
+
+ return new_ti;
+}
+
+int ovs_flow_tbl_flush(struct flow_table *flow_table)
+{
+ struct table_instance *old_ti;
+ struct table_instance *new_ti;
+
+ old_ti = ovsl_dereference(flow_table->ti);
+ new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!new_ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(flow_table->ti, new_ti);
+ flow_table->last_rehash = jiffies;
+ flow_table->count = 0;
+
+ table_instance_destroy(old_ti, true);
+ return 0;
+}
+
+static u32 flow_hash(const struct sw_flow_key *key, int key_start,
+ int key_end)
+{
+ u32 *hash_key = (u32 *)((u8 *)key + key_start);
+ int hash_u32s = (key_end - key_start) >> 2;
+
+ /* Make sure number of hash bytes are multiple of u32. */
+ BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+
+ return jhash2(hash_key, hash_u32s, 0);
+}
+
+static int flow_key_start(const struct sw_flow_key *key)
+{
+ if (key->tun_key.ipv4_dst)
+ return 0;
+ else
+ return rounddown(offsetof(struct sw_flow_key, phy),
+ sizeof(long));
+}
+
+static bool cmp_key(const struct sw_flow_key *key1,
+ const struct sw_flow_key *key2,
+ int key_start, int key_end)
+{
+ const long *cp1 = (long *)((u8 *)key1 + key_start);
+ const long *cp2 = (long *)((u8 *)key2 + key_start);
+ long diffs = 0;
+ int i;
+
+ for (i = key_start; i < key_end; i += sizeof(long))
+ diffs |= *cp1++ ^ *cp2++;
+
+ return diffs == 0;
+}
+
+static bool flow_cmp_masked_key(const struct sw_flow *flow,
+ const struct sw_flow_key *key,
+ int key_start, int key_end)
+{
+ return cmp_key(&flow->key, key, key_start, key_end);
+}
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match)
+{
+ struct sw_flow_key *key = match->key;
+ int key_start = flow_key_start(key);
+ int key_end = match->range.end;
+
+ return cmp_key(&flow->unmasked_key, key, key_start, key_end);
+}
+
+static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
+ const struct sw_flow_key *unmasked,
+ struct sw_flow_mask *mask)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int key_start = mask->range.start;
+ int key_end = mask->range.end;
+ u32 hash;
+ struct sw_flow_key masked_key;
+
+ ovs_flow_mask_key(&masked_key, unmasked, mask);
+ hash = flow_hash(&masked_key, key_start, key_end);
+ head = find_bucket(ti, hash);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
+ if (flow->mask == mask && flow->hash == hash &&
+ flow_cmp_masked_key(flow, &masked_key,
+ key_start, key_end))
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit)
+{
+ struct table_instance *ti = rcu_dereference(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ *n_mask_hit = 0;
+ list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+ (*n_mask_hit)++;
+ flow = masked_flow_lookup(ti, key, mask);
+ if (flow) /* Found */
+ return flow;
+ }
+ return NULL;
+}
+
+int ovs_flow_tbl_num_masks(const struct flow_table *table)
+{
+ struct sw_flow_mask *mask;
+ int num = 0;
+
+ list_for_each_entry(mask, &table->mask_list, list)
+ num++;
+
+ return num;
+}
+
+static struct table_instance *table_instance_expand(struct table_instance *ti)
+{
+ return table_instance_rehash(ti, ti->n_buckets * 2);
+}
+
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ BUG_ON(table->count == 0);
+ hlist_del_rcu(&flow->hash_node[ti->node_ver]);
+ table->count--;
+}
+
+static struct sw_flow_mask *mask_alloc(void)
+{
+ struct sw_flow_mask *mask;
+
+ mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+ if (mask)
+ mask->ref_count = 0;
+
+ return mask;
+}
+
+static void mask_add_ref(struct sw_flow_mask *mask)
+{
+ mask->ref_count++;
+}
+
+static bool mask_equal(const struct sw_flow_mask *a,
+ const struct sw_flow_mask *b)
+{
+ u8 *a_ = (u8 *)&a->key + a->range.start;
+ u8 *b_ = (u8 *)&b->key + b->range.start;
+
+ return (a->range.end == b->range.end)
+ && (a->range.start == b->range.start)
+ && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
+}
+
+static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
+ const struct sw_flow_mask *mask)
+{
+ struct list_head *ml;
+
+ list_for_each(ml, &tbl->mask_list) {
+ struct sw_flow_mask *m;
+ m = container_of(ml, struct sw_flow_mask, list);
+ if (mask_equal(mask, m))
+ return m;
+ }
+
+ return NULL;
+}
+
+/**
+ * add a new mask into the mask list.
+ * The caller needs to make sure that 'mask' is not the same
+ * as any masks that are already on the list.
+ */
+static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
+ struct sw_flow_mask *new)
+{
+ struct sw_flow_mask *mask;
+ mask = flow_mask_find(tbl, new);
+ if (!mask) {
+ /* Allocate a new mask if none exsits. */
+ mask = mask_alloc();
+ if (!mask)
+ return -ENOMEM;
+ mask->key = new->key;
+ mask->range = new->range;
+ list_add_rcu(&mask->list, &tbl->mask_list);
+ }
+
+ mask_add_ref(mask);
+ flow->mask = mask;
+ return 0;
+}
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask)
+{
+ struct table_instance *new_ti = NULL;
+ struct table_instance *ti;
+ int err;
+
+ err = flow_mask_insert(table, flow, mask);
+ if (err)
+ return err;
+
+ flow->hash = flow_hash(&flow->key, flow->mask->range.start,
+ flow->mask->range.end);
+ ti = ovsl_dereference(table->ti);
+ table_instance_insert(ti, flow);
+ table->count++;
+
+ /* Expand table, if necessary, to make room. */
+ if (table->count > ti->n_buckets)
+ new_ti = table_instance_expand(ti);
+ else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
+ new_ti = table_instance_rehash(ti, ti->n_buckets);
+
+ if (new_ti) {
+ rcu_assign_pointer(table->ti, new_ti);
+ table_instance_destroy(ti, true);
+ table->last_rehash = jiffies;
+ }
+ return 0;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int ovs_flow_init(void)
+{
+ BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
+ BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
+
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
+ 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void ovs_flow_exit(void)
+{
+ kmem_cache_destroy(flow_cache);
+}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
new file mode 100644
index 000000000000..fbe45d5ad07d
--- /dev/null
+++ b/net/openvswitch/flow_table.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_TABLE_H
+#define FLOW_TABLE_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+struct table_instance {
+ struct flex_array *buckets;
+ unsigned int n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
+};
+
+struct flow_table {
+ struct table_instance __rcu *ti;
+ struct list_head mask_list;
+ unsigned long last_rehash;
+ unsigned int count;
+};
+
+int ovs_flow_init(void);
+void ovs_flow_exit(void);
+
+struct sw_flow *ovs_flow_alloc(void);
+void ovs_flow_free(struct sw_flow *, bool deferred);
+
+int ovs_flow_tbl_init(struct flow_table *);
+int ovs_flow_tbl_count(struct flow_table *table);
+void ovs_flow_tbl_destroy(struct flow_table *table);
+int ovs_flow_tbl_flush(struct flow_table *flow_table);
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask);
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+int ovs_flow_tbl_num_masks(const struct flow_table *table);
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
+ u32 *bucket, u32 *idx);
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
+ const struct sw_flow_key *,
+ u32 *n_mask_hit);
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match);
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask);
+#endif /* flow_table.h */
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index c99dea543d64..a3d6951602db 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -24,8 +24,6 @@
#include <linux/if_tunnel.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
-#include <linux/if_vlan.h>
-#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 98d3edbbc235..729c68763fe7 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -134,7 +134,7 @@ static void do_setup(struct net_device *netdev)
netdev->tx_queue_len = 0;
netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
- NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
+ NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
netdev->vlan_features = netdev->features;
netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 09d93c13cfd6..d21f77d875ba 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -150,15 +150,25 @@ static void free_port_rcu(struct rcu_head *rcu)
ovs_vport_free(vport_from_priv(netdev_vport));
}
-static void netdev_destroy(struct vport *vport)
+void ovs_netdev_detach_dev(struct vport *vport)
{
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- rtnl_lock();
+ ASSERT_RTNL();
netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(netdev_vport->dev,
+ netdev_master_upper_dev_get(netdev_vport->dev));
dev_set_promiscuity(netdev_vport->dev, -1);
+}
+
+static void netdev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ rtnl_lock();
+ if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
rtnl_unlock();
call_rcu(&netdev_vport->rcu, free_port_rcu);
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index dd298b5c5cdb..8df01c1127e5 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -39,5 +39,6 @@ netdev_vport_priv(const struct vport *vport)
}
const char *ovs_netdev_get_name(const struct vport *);
+void ovs_netdev_detach_dev(struct vport *);
#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 56e22b74cf96..e797a50ac2be 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -29,7 +29,6 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/ip_tunnels.h>
-#include <net/udp.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/dsfield.h>
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a9dfdda9ed1d..fdc041c57853 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -255,6 +255,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ f->time_next_packet = 0ULL;
}
return f;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f6334aa19151..7567e6f1a920 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -279,7 +279,9 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port));
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
- if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+ if (!laddr->valid || laddr->state == SCTP_ADDR_DEL ||
+ (laddr->state != SCTP_ADDR_SRC &&
+ !asoc->src_out_of_asoc_ok))
continue;
/* Do not compare against v4 addrs */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 666c66842799..1a6eef39ab2f 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -860,7 +860,6 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds,
(!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK))
return;
- BUG_ON(asoc->peer.primary_path == NULL);
sctp_unhash_established(asoc);
sctp_association_free(asoc);
}
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index c959312c45e3..e2fa133f9fba 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -16,8 +16,8 @@ config X25
if you want that) and the lower level data link layer protocol LAPB
(say Y to "LAPB Data Link Driver" below if you want that).
- You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
- <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+ You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and
+ <http://docwiki.cisco.com/wiki/X.25>.
Information about X.25 for Linux is contained in the files
<file:Documentation/networking/x25.txt> and
<file:Documentation/networking/x25-iface.txt>.
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index b943c7fc5ed2..ccfdc7115a83 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -141,14 +141,14 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
const int plen = skb->len;
int dlen = IPCOMP_SCRATCH_SIZE;
u8 *start = skb->data;
- const int cpu = get_cpu();
- u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
- struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+ struct crypto_comp *tfm;
+ u8 *scratch;
int err;
local_bh_disable();
+ scratch = *this_cpu_ptr(ipcomp_scratches);
+ tfm = *this_cpu_ptr(ipcd->tfms);
err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
- local_bh_enable();
if (err)
goto out;
@@ -158,13 +158,13 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
}
memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
- put_cpu();
+ local_bh_enable();
pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
return 0;
out:
- put_cpu();
+ local_bh_enable();
return err;
}