summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/bridge/br_device.c2
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_multicast.c44
-rw-r--r--net/bridge/br_netfilter.c2
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/bridge/netfilter/ebt_ulog.c9
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c41
-rw-r--r--net/core/dev.c22
-rw-r--r--net/core/flow_dissector.c2
-rw-r--r--net/core/net-sysfs.c2
-rw-r--r--net/core/netpoll.c31
-rw-r--r--net/core/skbuff.c31
-rw-r--r--net/hsr/Kconfig27
-rw-r--r--net/hsr/Makefile7
-rw-r--r--net/hsr/hsr_device.c596
-rw-r--r--net/hsr/hsr_device.h29
-rw-r--r--net/hsr/hsr_framereg.c503
-rw-r--r--net/hsr/hsr_framereg.h53
-rw-r--r--net/hsr/hsr_main.c469
-rw-r--r--net/hsr/hsr_main.h166
-rw-r--r--net/hsr/hsr_netlink.c457
-rw-r--r--net/hsr/hsr_netlink.h30
-rw-r--r--net/ieee802154/6lowpan.c32
-rw-r--r--net/ipv4/esp4.c49
-rw-r--r--net/ipv4/netfilter/arp_tables.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c110
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c7
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_bic.c5
-rw-r--r--net/ipv4/tcp_cong.c47
-rw-r--r--net/ipv4/tcp_cubic.c5
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_highspeed.c4
-rw-r--r--net/ipv4/tcp_htcp.c4
-rw-r--r--net/ipv4/tcp_hybla.c5
-rw-r--r--net/ipv4/tcp_illinois.c5
-rw-r--r--net/ipv4/tcp_input.c40
-rw-r--r--net/ipv4/tcp_lp.c5
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_scalable.c5
-rw-r--r--net/ipv4/tcp_vegas.c11
-rw-r--r--net/ipv4/tcp_veno.c9
-rw-r--r--net/ipv4/tcp_yeah.c5
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv6/esp6.c48
-rw-r--r--net/ipv6/netfilter/ip6_tables.c5
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c7
-rw-r--r--net/ipv6/route.c9
-rw-r--r--net/ipv6/xfrm6_policy.c8
-rw-r--r--net/mac802154/wpan.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h11
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c2
-rw-r--r--net/netfilter/ipset/ip_set_core.c70
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h21
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c22
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c22
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c48
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c39
-rw-r--r--net/netfilter/nf_conntrack_acct.c12
-rw-r--r--net/netfilter/nf_conntrack_core.c16
-rw-r--r--net/netfilter/nf_conntrack_netlink.c51
-rw-r--r--net/netfilter/nft_compat.c8
-rw-r--r--net/netfilter/nft_nat.c12
-rw-r--r--net/netfilter/x_tables.c7
-rw-r--r--net/netfilter/xt_NFQUEUE.c7
-rw-r--r--net/netfilter/xt_connbytes.c6
-rw-r--r--net/netfilter/xt_socket.c13
-rw-r--r--net/openvswitch/Makefile2
-rw-r--r--net/openvswitch/datapath.c668
-rw-r--r--net/openvswitch/datapath.h9
-rw-r--r--net/openvswitch/dp_notify.c7
-rw-r--r--net/openvswitch/flow.c1605
-rw-r--r--net/openvswitch/flow.h132
-rw-r--r--net/openvswitch/flow_netlink.c1630
-rw-r--r--net/openvswitch/flow_netlink.h60
-rw-r--r--net/openvswitch/flow_table.c592
-rw-r--r--net/openvswitch/flow_table.h81
-rw-r--r--net/openvswitch/vport-gre.c2
-rw-r--r--net/openvswitch/vport-internal_dev.c2
-rw-r--r--net/openvswitch/vport-netdev.c16
-rw-r--r--net/openvswitch/vport-netdev.h1
-rw-r--r--net/openvswitch/vport-vxlan.c1
-rw-r--r--net/sched/sch_fq.c1
-rw-r--r--net/sctp/ipv6.c4
-rw-r--r--net/sctp/output.c9
-rw-r--r--net/sctp/sm_sideeffect.c1
-rw-r--r--net/x25/Kconfig4
-rw-r--r--net/xfrm/xfrm_ipcomp.c18
-rw-r--r--net/xfrm/xfrm_policy.c7
98 files changed, 5471 insertions, 2770 deletions
diff --git a/net/Kconfig b/net/Kconfig
index b50dacc072f0..0715db64a5c3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -220,6 +220,7 @@ source "net/openvswitch/Kconfig"
source "net/vmw_vsock/Kconfig"
source "net/netlink/Kconfig"
source "net/mpls/Kconfig"
+source "net/hsr/Kconfig"
config RPS
boolean
diff --git a/net/Makefile b/net/Makefile
index 9492e8cb64e9..8fa2f91517f1 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -71,3 +71,4 @@ obj-$(CONFIG_NFC) += nfc/
obj-$(CONFIG_OPENVSWITCH) += openvswitch/
obj-$(CONFIG_VSOCKETS) += vmw_vsock/
obj-$(CONFIG_NET_MPLS_GSO) += mpls/
+obj-$(CONFIG_HSR) += hsr/
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ca04163635da..e6b7fecb3af1 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -64,7 +64,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_flood_deliver(br, skb, false);
goto out;
}
- if (br_multicast_rcv(br, NULL, skb)) {
+ if (br_multicast_rcv(br, NULL, skb, vid)) {
kfree_skb(skb);
goto out;
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index a2fd37ec35f7..7e73c32e205d 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
- br_multicast_rcv(br, p, skb))
+ br_multicast_rcv(br, p, skb, vid))
goto drop;
if (p->state == BR_STATE_LEARNING)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 0513ef3ce667..4c214b2b88ef 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -947,7 +947,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
@@ -957,12 +958,10 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
int type;
int err = 0;
__be32 group;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*ih)))
return -EINVAL;
- br_vlan_get_tag(skb, &vid);
ih = igmpv3_report_hdr(skb);
num = ntohs(ih->ngrec);
len = sizeof(*ih);
@@ -1005,7 +1004,8 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
@@ -1013,12 +1013,10 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
int len;
int num;
int err = 0;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*icmp6h)))
return -EINVAL;
- br_vlan_get_tag(skb, &vid);
icmp6h = icmp6_hdr(skb);
num = ntohs(icmp6h->icmp6_dataun.un_data16[1]);
len = sizeof(*icmp6h);
@@ -1141,7 +1139,8 @@ static void br_multicast_query_received(struct net_bridge *br,
static int br_ip4_multicast_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
const struct iphdr *iph = ip_hdr(skb);
struct igmphdr *ih = igmp_hdr(skb);
@@ -1153,7 +1152,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
unsigned long now = jiffies;
__be32 group;
int err = 0;
- u16 vid = 0;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
@@ -1189,7 +1187,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
if (!group)
goto out;
- br_vlan_get_tag(skb, &vid);
mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid);
if (!mp)
goto out;
@@ -1219,7 +1216,8 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct mld_msg *mld;
@@ -1231,7 +1229,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
unsigned long now = jiffies;
const struct in6_addr *group = NULL;
int err = 0;
- u16 vid = 0;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
@@ -1265,7 +1262,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (!group)
goto out;
- br_vlan_get_tag(skb, &vid);
mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid);
if (!mp)
goto out;
@@ -1439,7 +1435,8 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct sk_buff *skb2 = skb;
const struct iphdr *iph;
@@ -1447,7 +1444,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
unsigned int len;
unsigned int offset;
int err;
- u16 vid = 0;
/* We treat OOM as packet loss for now. */
if (!pskb_may_pull(skb, sizeof(*iph)))
@@ -1508,7 +1504,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = 0;
- br_vlan_get_tag(skb2, &vid);
BR_INPUT_SKB_CB(skb)->igmp = 1;
ih = igmp_hdr(skb2);
@@ -1519,10 +1514,10 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_add_group(br, port, ih->group, vid);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
- err = br_ip4_multicast_igmp3_report(br, port, skb2);
+ err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
break;
case IGMP_HOST_MEMBERSHIP_QUERY:
- err = br_ip4_multicast_query(br, port, skb2);
+ err = br_ip4_multicast_query(br, port, skb2, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
br_ip4_multicast_leave_group(br, port, ih->group, vid);
@@ -1540,7 +1535,8 @@ err_out:
#if IS_ENABLED(CONFIG_IPV6)
static int br_multicast_ipv6_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
struct sk_buff *skb2;
const struct ipv6hdr *ip6h;
@@ -1550,7 +1546,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
unsigned int len;
int offset;
int err;
- u16 vid = 0;
if (!pskb_may_pull(skb, sizeof(*ip6h)))
return -EINVAL;
@@ -1640,7 +1635,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
err = 0;
- br_vlan_get_tag(skb, &vid);
BR_INPUT_SKB_CB(skb)->igmp = 1;
switch (icmp6_type) {
@@ -1657,10 +1651,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
break;
}
case ICMPV6_MLD2_REPORT:
- err = br_ip6_multicast_mld2_report(br, port, skb2);
+ err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
break;
case ICMPV6_MGM_QUERY:
- err = br_ip6_multicast_query(br, port, skb2);
+ err = br_ip6_multicast_query(br, port, skb2, vid);
break;
case ICMPV6_MGM_REDUCTION:
{
@@ -1681,7 +1675,7 @@ out:
#endif
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb, u16 vid)
{
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
@@ -1691,10 +1685,10 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
switch (skb->protocol) {
case htons(ETH_P_IP):
- return br_multicast_ipv4_rcv(br, port, skb);
+ return br_multicast_ipv4_rcv(br, port, skb, vid);
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_multicast_ipv6_rcv(br, port, skb);
+ return br_multicast_ipv6_rcv(br, port, skb, vid);
#endif
}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 878f008afefa..80cad2cf02a7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -559,6 +559,8 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
else if (skb->protocol == htons(ETH_P_PPP_SES))
nf_bridge->mask |= BRNF_PPPoE;
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
return skb->dev;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d1ca6d956633..229d820bdf0b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -435,7 +435,7 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
extern unsigned int br_mdb_rehash_seq;
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
- struct sk_buff *skb);
+ struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct sk_buff *skb, u16 vid);
void br_multicast_add_port(struct net_bridge_port *port);
@@ -504,7 +504,8 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u16 vid)
{
return 0;
}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68f8128147be..5ca74a0e595f 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -3,6 +3,7 @@
#
#
config NF_TABLES_BRIDGE
+ depends on NF_TABLES
tristate "Ethernet Bridge nf_tables support"
menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 518093802d1d..7c470c371e14 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -181,6 +181,7 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
ub->qlen++;
pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* Fill in the ulog data */
pm->version = EBT_ULOG_VERSION;
@@ -193,8 +194,6 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
pm->hook = hooknr;
if (uloginfo->prefix != NULL)
strcpy(pm->prefix, uloginfo->prefix);
- else
- *(pm->prefix) = '\0';
if (in) {
strcpy(pm->physindev, in->name);
@@ -204,16 +203,14 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
else
strcpy(pm->indev, in->name);
- } else
- pm->indev[0] = pm->physindev[0] = '\0';
+ }
if (out) {
/* If out exists, then out is a bridge port */
strcpy(pm->physoutdev, out->name);
/* rcu_read_lock()ed by nf_hook_slow */
strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
- } else
- pm->outdev[0] = pm->physoutdev[0] = '\0';
+ }
if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0)
BUG();
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index e8cb016fa34d..cf54b22818c8 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -47,14 +48,50 @@ static struct pernet_operations nf_tables_bridge_net_ops = {
.exit = nf_tables_bridge_exit_net,
};
+static unsigned int
+nft_do_chain_bridge(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, ops, skb, in, out);
+
+ return nft_do_chain_pktinfo(&pkt, ops);
+}
+
+static struct nf_chain_type filter_bridge = {
+ .family = NFPROTO_BRIDGE,
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .hook_mask = (1 << NF_BR_LOCAL_IN) |
+ (1 << NF_BR_FORWARD) |
+ (1 << NF_BR_LOCAL_OUT),
+ .fn = {
+ [NF_BR_LOCAL_IN] = nft_do_chain_bridge,
+ [NF_BR_FORWARD] = nft_do_chain_bridge,
+ [NF_BR_LOCAL_OUT] = nft_do_chain_bridge,
+ },
+};
+
static int __init nf_tables_bridge_init(void)
{
- return register_pernet_subsys(&nf_tables_bridge_net_ops);
+ int ret;
+
+ nft_register_chain_type(&filter_bridge);
+ ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_bridge);
+
+ return ret;
}
static void __exit nf_tables_bridge_exit(void)
{
- return unregister_pernet_subsys(&nf_tables_bridge_net_ops);
+ unregister_pernet_subsys(&nf_tables_bridge_net_ops);
+ nft_unregister_chain_type(&filter_bridge);
}
module_init(nf_tables_bridge_init);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0054c8c75f50..0e6136546a8c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6196,6 +6196,16 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
@@ -6239,7 +6249,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kzalloc(alloc_size, GFP_KERNEL);
+ p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!p)
+ p = vzalloc(alloc_size);
if (!p)
return NULL;
@@ -6248,7 +6260,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
- goto free_p;
+ goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
@@ -6301,8 +6313,8 @@ free_pcpu:
kfree(dev->_rx);
#endif
-free_p:
- kfree(p);
+free_dev:
+ netdev_freemem(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
@@ -6339,7 +6351,7 @@ void free_netdev(struct net_device *dev)
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
return;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5cac36e6ccd1..0242035192f1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -66,7 +66,7 @@ again:
struct iphdr _iph;
ip:
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (!iph)
+ if (!iph || iph->ihl < 5)
return false;
if (ip_is_fragment(iph))
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d954b56b4e47..d03f2c9750fa 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1263,7 +1263,7 @@ static void netdev_release(struct device *d)
BUG_ON(dev->reg_state != NETREG_RELEASED);
kfree(dev->ifalias);
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
}
static const void *net_namespace(struct device *d)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index fc75c9e461b8..8f971990677c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -636,8 +636,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo
netpoll_send_skb(np, send_skb);
- /* If there are several rx_hooks for the same address,
- we're fine by sending a single reply */
+ /* If there are several rx_skb_hooks for the same
+ * address we're fine by sending a single reply
+ */
break;
}
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
@@ -719,8 +720,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo
netpoll_send_skb(np, send_skb);
- /* If there are several rx_hooks for the same address,
- we're fine by sending a single reply */
+ /* If there are several rx_skb_hooks for the same
+ * address, we're fine by sending a single reply
+ */
break;
}
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
@@ -756,11 +758,12 @@ static bool pkt_is_ns(struct sk_buff *skb)
int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
{
- int proto, len, ulen;
- int hits = 0;
+ int proto, len, ulen, data_len;
+ int hits = 0, offset;
const struct iphdr *iph;
struct udphdr *uh;
struct netpoll *np, *tmp;
+ uint16_t source;
if (list_empty(&npinfo->rx_np))
goto out;
@@ -820,7 +823,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
len -= iph->ihl*4;
uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
+ offset = (unsigned char *)(uh + 1) - skb->data;
ulen = ntohs(uh->len);
+ data_len = skb->len - offset;
+ source = ntohs(uh->source);
if (ulen != len)
goto out;
@@ -834,9 +840,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (np->local_port && np->local_port != ntohs(uh->dest))
continue;
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
+ np->rx_skb_hook(np, source, skb, offset, data_len);
hits++;
}
} else {
@@ -859,7 +863,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
uh = udp_hdr(skb);
+ offset = (unsigned char *)(uh + 1) - skb->data;
ulen = ntohs(uh->len);
+ data_len = skb->len - offset;
+ source = ntohs(uh->source);
if (ulen != skb->len)
goto out;
if (udp6_csum_init(skb, uh, IPPROTO_UDP))
@@ -872,9 +879,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
if (np->local_port && np->local_port != ntohs(uh->dest))
continue;
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
+ np->rx_skb_hook(np, source, skb, offset, data_len);
hits++;
}
#endif
@@ -1062,7 +1067,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
npinfo->netpoll = np;
- if (np->rx_hook) {
+ if (np->rx_skb_hook) {
spin_lock_irqsave(&npinfo->rx_lock, flags);
npinfo->rx_flags |= NETPOLL_RX_ENABLED;
list_add_tail(&np->rx, &npinfo->rx_np);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0ab32faa520f..e4115597b38b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1928,9 +1928,8 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
+__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
+ __wsum csum, const struct skb_checksum_ops *ops)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -1941,7 +1940,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = csum_partial(skb->data + offset, copy, csum);
+ csum = ops->update(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1962,10 +1961,10 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
vaddr = kmap_atomic(skb_frag_page(frag));
- csum2 = csum_partial(vaddr + frag->page_offset +
- offset - start, copy, 0);
+ csum2 = ops->update(vaddr + frag->page_offset +
+ offset - start, copy, 0);
kunmap_atomic(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+ csum = ops->combine(csum, csum2, pos, copy);
if (!(len -= copy))
return csum;
offset += copy;
@@ -1984,9 +1983,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = skb_checksum(frag_iter, offset - start,
- copy, 0);
- csum = csum_block_add(csum, csum2, pos);
+ csum2 = __skb_checksum(frag_iter, offset - start,
+ copy, 0, ops);
+ csum = ops->combine(csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1998,6 +1997,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
return csum;
}
+EXPORT_SYMBOL(__skb_checksum);
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ const struct skb_checksum_ops ops = {
+ .update = csum_partial_ext,
+ .combine = csum_block_add_ext,
+ };
+
+ return __skb_checksum(skb, offset, len, csum, &ops);
+}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
new file mode 100644
index 000000000000..0d3d709052ca
--- /dev/null
+++ b/net/hsr/Kconfig
@@ -0,0 +1,27 @@
+#
+# IEC 62439-3 High-availability Seamless Redundancy
+#
+
+config HSR
+ tristate "High-availability Seamless Redundancy (HSR)"
+ ---help---
+ If you say Y here, then your Linux box will be able to act as a
+ DANH ("Doubly attached node implementing HSR"). For this to work,
+ your Linux box needs (at least) two physical Ethernet interfaces,
+ and it must be connected as a node in a ring network together with
+ other HSR capable nodes.
+
+ All Ethernet frames sent over the hsr device will be sent in both
+ directions on the ring (over both slave ports), giving a redundant,
+ instant fail-over network. Each HSR node in the ring acts like a
+ bridge for HSR frames, but filters frames that have been forwarded
+ earlier.
+
+ This code is a "best effort" to comply with the HSR standard as
+ described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have
+ been made.
+
+ You need to perform any and all necessary tests yourself before
+ relying on this code in a safety critical system!
+
+ If unsure, say N.
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
new file mode 100644
index 000000000000..b68359f181cc
--- /dev/null
+++ b/net/hsr/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for HSR
+#
+
+obj-$(CONFIG_HSR) += hsr.o
+
+hsr-y := hsr_main.o hsr_framereg.o hsr_device.o hsr_netlink.o
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
new file mode 100644
index 000000000000..cac505f166d5
--- /dev/null
+++ b/net/hsr/hsr_device.c
@@ -0,0 +1,596 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * This file contains device methods for creating, using and destroying
+ * virtual HSR devices.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+#include "hsr_main.h"
+
+
+static bool is_admin_up(struct net_device *dev)
+{
+ return dev && (dev->flags & IFF_UP);
+}
+
+static bool is_slave_up(struct net_device *dev)
+{
+ return dev && is_admin_up(dev) && netif_oper_up(dev);
+}
+
+static void __hsr_set_operstate(struct net_device *dev, int transition)
+{
+ write_lock_bh(&dev_base_lock);
+ if (dev->operstate != transition) {
+ dev->operstate = transition;
+ write_unlock_bh(&dev_base_lock);
+ netdev_state_change(dev);
+ } else {
+ write_unlock_bh(&dev_base_lock);
+ }
+}
+
+void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1,
+ struct net_device *slave2)
+{
+ if (!is_admin_up(hsr_dev)) {
+ __hsr_set_operstate(hsr_dev, IF_OPER_DOWN);
+ return;
+ }
+
+ if (is_slave_up(slave1) || is_slave_up(slave2))
+ __hsr_set_operstate(hsr_dev, IF_OPER_UP);
+ else
+ __hsr_set_operstate(hsr_dev, IF_OPER_LOWERLAYERDOWN);
+}
+
+void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1,
+ struct net_device *slave2)
+{
+ if (is_slave_up(slave1) || is_slave_up(slave2))
+ netif_carrier_on(hsr_dev);
+ else
+ netif_carrier_off(hsr_dev);
+}
+
+
+void hsr_check_announce(struct net_device *hsr_dev, int old_operstate)
+{
+ struct hsr_priv *hsr_priv;
+
+ hsr_priv = netdev_priv(hsr_dev);
+
+ if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) {
+ /* Went up */
+ hsr_priv->announce_count = 0;
+ hsr_priv->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ add_timer(&hsr_priv->announce_timer);
+ }
+
+ if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP))
+ /* Went down */
+ del_timer(&hsr_priv->announce_timer);
+}
+
+
+int hsr_get_max_mtu(struct hsr_priv *hsr_priv)
+{
+ int mtu_max;
+
+ if (hsr_priv->slave[0] && hsr_priv->slave[1])
+ mtu_max = min(hsr_priv->slave[0]->mtu, hsr_priv->slave[1]->mtu);
+ else if (hsr_priv->slave[0])
+ mtu_max = hsr_priv->slave[0]->mtu;
+ else if (hsr_priv->slave[1])
+ mtu_max = hsr_priv->slave[1]->mtu;
+ else
+ mtu_max = HSR_TAGLEN;
+
+ return mtu_max - HSR_TAGLEN;
+}
+
+static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct hsr_priv *hsr_priv;
+
+ hsr_priv = netdev_priv(dev);
+
+ if (new_mtu > hsr_get_max_mtu(hsr_priv)) {
+ netdev_info(hsr_priv->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
+ HSR_TAGLEN);
+ return -EINVAL;
+ }
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+static int hsr_dev_open(struct net_device *dev)
+{
+ struct hsr_priv *hsr_priv;
+ int i;
+ char *slave_name;
+
+ hsr_priv = netdev_priv(dev);
+
+ for (i = 0; i < HSR_MAX_SLAVE; i++) {
+ if (hsr_priv->slave[i])
+ slave_name = hsr_priv->slave[i]->name;
+ else
+ slave_name = "null";
+
+ if (!is_slave_up(hsr_priv->slave[i]))
+ netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a working HSR network\n",
+ 'A' + i, slave_name);
+ }
+
+ return 0;
+}
+
+static int hsr_dev_close(struct net_device *dev)
+{
+ /* Nothing to do here. We could try to restore the state of the slaves
+ * to what they were before being changed by the hsr master dev's state,
+ * but they might have been changed manually in the mean time too, so
+ * taking them up or down here might be confusing and is probably not a
+ * good idea.
+ */
+ return 0;
+}
+
+
+static void hsr_fill_tag(struct hsr_ethhdr *hsr_ethhdr, struct hsr_priv *hsr_priv)
+{
+ unsigned long irqflags;
+
+ /* IEC 62439-1:2010, p 48, says the 4-bit "path" field can take values
+ * between 0001-1001 ("ring identifier", for regular HSR frames),
+ * or 1111 ("HSR management", supervision frames). Unfortunately, the
+ * spec writers forgot to explain what a "ring identifier" is, or
+ * how it is used. So we just set this to 0001 for regular frames,
+ * and 1111 for supervision frames.
+ */
+ set_hsr_tag_path(&hsr_ethhdr->hsr_tag, 0x1);
+
+ /* IEC 62439-1:2010, p 12: "The link service data unit in an Ethernet
+ * frame is the content of the frame located between the Length/Type
+ * field and the Frame Check Sequence."
+ *
+ * IEC 62439-3, p 48, specifies the "original LPDU" to include the
+ * original "LT" field (what "LT" means is not explained anywhere as
+ * far as I can see - perhaps "Length/Type"?). So LSDU_size might
+ * equal original length + 2.
+ * Also, the fact that this field is not used anywhere (might be used
+ * by a RedBox connecting HSR and PRP nets?) means I cannot test its
+ * correctness. Instead of guessing, I set this to 0 here, to make any
+ * problems immediately apparent. Anyone using this driver with PRP/HSR
+ * RedBoxes might need to fix this...
+ */
+ set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, 0);
+
+ spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags);
+ hsr_ethhdr->hsr_tag.sequence_nr = htons(hsr_priv->sequence_nr);
+ hsr_priv->sequence_nr++;
+ spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags);
+
+ hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
+
+ hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP);
+}
+
+static int slave_xmit(struct sk_buff *skb, struct hsr_priv *hsr_priv,
+ enum hsr_dev_idx dev_idx)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+
+ hsr_ethhdr = (struct hsr_ethhdr *) skb->data;
+
+ skb->dev = hsr_priv->slave[dev_idx];
+
+ hsr_addr_subst_dest(hsr_priv, &hsr_ethhdr->ethhdr, dev_idx);
+
+ /* Address substitution (IEC62439-3 pp 26, 50): replace mac
+ * address of outgoing frame with that of the outgoing slave's.
+ */
+ memcpy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr, ETH_ALEN);
+
+ return dev_queue_xmit(skb);
+}
+
+
+static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct hsr_priv *hsr_priv;
+ struct hsr_ethhdr *hsr_ethhdr;
+ struct sk_buff *skb2;
+ int res1, res2;
+
+ hsr_priv = netdev_priv(dev);
+ hsr_ethhdr = (struct hsr_ethhdr *) skb->data;
+
+ if ((skb->protocol != htons(ETH_P_PRP)) ||
+ (hsr_ethhdr->ethhdr.h_proto != htons(ETH_P_PRP))) {
+ hsr_fill_tag(hsr_ethhdr, hsr_priv);
+ skb->protocol = htons(ETH_P_PRP);
+ }
+
+ skb2 = pskb_copy(skb, GFP_ATOMIC);
+
+ res1 = NET_XMIT_DROP;
+ if (likely(hsr_priv->slave[HSR_DEV_SLAVE_A]))
+ res1 = slave_xmit(skb, hsr_priv, HSR_DEV_SLAVE_A);
+
+ res2 = NET_XMIT_DROP;
+ if (likely(skb2 && hsr_priv->slave[HSR_DEV_SLAVE_B]))
+ res2 = slave_xmit(skb2, hsr_priv, HSR_DEV_SLAVE_B);
+
+ if (likely(res1 == NET_XMIT_SUCCESS || res1 == NET_XMIT_CN ||
+ res2 == NET_XMIT_SUCCESS || res2 == NET_XMIT_CN)) {
+ hsr_priv->dev->stats.tx_packets++;
+ hsr_priv->dev->stats.tx_bytes += skb->len;
+ } else {
+ hsr_priv->dev->stats.tx_dropped++;
+ }
+
+ return NETDEV_TX_OK;
+}
+
+
+static int hsr_header_create(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ int res;
+
+ /* Make room for the HSR tag now. We will fill it in later (in
+ * hsr_dev_xmit)
+ */
+ if (skb_headroom(skb) < HSR_TAGLEN + ETH_HLEN)
+ return -ENOBUFS;
+ skb_push(skb, HSR_TAGLEN);
+
+ /* To allow VLAN/HSR combos we should probably use
+ * res = dev_hard_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN);
+ * here instead. It would require other changes too, though - e.g.
+ * separate headers for each slave etc...
+ */
+ res = eth_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN);
+ if (res <= 0)
+ return res;
+ skb_reset_mac_header(skb);
+
+ return res + HSR_TAGLEN;
+}
+
+
+static const struct header_ops hsr_header_ops = {
+ .create = hsr_header_create,
+ .parse = eth_header_parse,
+};
+
+
+/* HSR:2010 supervision frames should be padded so that the whole frame,
+ * including headers and FCS, is 64 bytes (without VLAN).
+ */
+static int hsr_pad(int size)
+{
+ const int min_size = ETH_ZLEN - HSR_TAGLEN - ETH_HLEN;
+
+ if (size >= min_size)
+ return size;
+ return min_size;
+}
+
+static void send_hsr_supervision_frame(struct net_device *hsr_dev, u8 type)
+{
+ struct hsr_priv *hsr_priv;
+ struct sk_buff *skb;
+ int hlen, tlen;
+ struct hsr_sup_tag *hsr_stag;
+ struct hsr_sup_payload *hsr_sp;
+ unsigned long irqflags;
+
+ hlen = LL_RESERVED_SPACE(hsr_dev);
+ tlen = hsr_dev->needed_tailroom;
+ skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen,
+ GFP_ATOMIC);
+
+ if (skb == NULL)
+ return;
+
+ hsr_priv = netdev_priv(hsr_dev);
+
+ skb_reserve(skb, hlen);
+
+ skb->dev = hsr_dev;
+ skb->protocol = htons(ETH_P_PRP);
+ skb->priority = TC_PRIO_CONTROL;
+
+ if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ hsr_priv->sup_multicast_addr,
+ skb->dev->dev_addr, skb->len) < 0)
+ goto out;
+
+ skb_pull(skb, sizeof(struct ethhdr));
+ hsr_stag = (typeof(hsr_stag)) skb->data;
+
+ set_hsr_stag_path(hsr_stag, 0xf);
+ set_hsr_stag_HSR_Ver(hsr_stag, 0);
+
+ spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags);
+ hsr_stag->sequence_nr = htons(hsr_priv->sequence_nr);
+ hsr_priv->sequence_nr++;
+ spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags);
+
+ hsr_stag->HSR_TLV_Type = type;
+ hsr_stag->HSR_TLV_Length = 12;
+
+ skb_push(skb, sizeof(struct ethhdr));
+
+ /* Payload: MacAddressA */
+ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp));
+ memcpy(hsr_sp->MacAddressA, hsr_dev->dev_addr, ETH_ALEN);
+
+ dev_queue_xmit(skb);
+ return;
+
+out:
+ kfree_skb(skb);
+}
+
+
+/* Announce (supervision frame) timer function
+ */
+static void hsr_announce(unsigned long data)
+{
+ struct hsr_priv *hsr_priv;
+
+ hsr_priv = (struct hsr_priv *) data;
+
+ if (hsr_priv->announce_count < 3) {
+ send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_ANNOUNCE);
+ hsr_priv->announce_count++;
+ } else {
+ send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_LIFE_CHECK);
+ }
+
+ if (hsr_priv->announce_count < 3)
+ hsr_priv->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ else
+ hsr_priv->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+
+ if (is_admin_up(hsr_priv->dev))
+ add_timer(&hsr_priv->announce_timer);
+}
+
+
+static void restore_slaves(struct net_device *hsr_dev)
+{
+ struct hsr_priv *hsr_priv;
+ int i;
+ int res;
+
+ hsr_priv = netdev_priv(hsr_dev);
+
+ rtnl_lock();
+
+ /* Restore promiscuity */
+ for (i = 0; i < HSR_MAX_SLAVE; i++) {
+ if (!hsr_priv->slave[i])
+ continue;
+ res = dev_set_promiscuity(hsr_priv->slave[i], -1);
+ if (res)
+ netdev_info(hsr_dev,
+ "Cannot restore slave promiscuity (%s, %d)\n",
+ hsr_priv->slave[i]->name, res);
+ }
+
+ rtnl_unlock();
+}
+
+static void reclaim_hsr_dev(struct rcu_head *rh)
+{
+ struct hsr_priv *hsr_priv;
+
+ hsr_priv = container_of(rh, struct hsr_priv, rcu_head);
+ free_netdev(hsr_priv->dev);
+}
+
+
+/* According to comments in the declaration of struct net_device, this function
+ * is "Called from unregister, can be used to call free_netdev". Ok then...
+ */
+static void hsr_dev_destroy(struct net_device *hsr_dev)
+{
+ struct hsr_priv *hsr_priv;
+
+ hsr_priv = netdev_priv(hsr_dev);
+
+ del_timer(&hsr_priv->announce_timer);
+ unregister_hsr_master(hsr_priv); /* calls list_del_rcu on hsr_priv */
+ restore_slaves(hsr_dev);
+ call_rcu(&hsr_priv->rcu_head, reclaim_hsr_dev); /* reclaim hsr_priv */
+}
+
+static const struct net_device_ops hsr_device_ops = {
+ .ndo_change_mtu = hsr_dev_change_mtu,
+ .ndo_open = hsr_dev_open,
+ .ndo_stop = hsr_dev_close,
+ .ndo_start_xmit = hsr_dev_xmit,
+};
+
+
+void hsr_dev_setup(struct net_device *dev)
+{
+ random_ether_addr(dev->dev_addr);
+
+ ether_setup(dev);
+ dev->header_ops = &hsr_header_ops;
+ dev->netdev_ops = &hsr_device_ops;
+ dev->tx_queue_len = 0;
+
+ dev->destructor = hsr_dev_destroy;
+}
+
+
+/* Return true if dev is a HSR master; return false otherwise.
+ */
+bool is_hsr_master(struct net_device *dev)
+{
+ return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
+}
+
+static int check_slave_ok(struct net_device *dev)
+{
+ /* Don't allow HSR on non-ethernet like devices */
+ if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) ||
+ (dev->addr_len != ETH_ALEN)) {
+ netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n");
+ return -EINVAL;
+ }
+
+ /* Don't allow enslaving hsr devices */
+ if (is_hsr_master(dev)) {
+ netdev_info(dev, "Cannot create trees of HSR devices.\n");
+ return -EINVAL;
+ }
+
+ if (is_hsr_slave(dev)) {
+ netdev_info(dev, "This device is already a HSR slave.\n");
+ return -EINVAL;
+ }
+
+ if (dev->priv_flags & IFF_802_1Q_VLAN) {
+ netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
+ return -EINVAL;
+ }
+
+ /* HSR over bonded devices has not been tested, but I'm not sure it
+ * won't work...
+ */
+
+ return 0;
+}
+
+
+/* Default multicast address for HSR Supervision frames */
+static const unsigned char def_multicast_addr[ETH_ALEN] = {
+ 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
+};
+
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ unsigned char multicast_spec)
+{
+ struct hsr_priv *hsr_priv;
+ int i;
+ int res;
+
+ hsr_priv = netdev_priv(hsr_dev);
+ hsr_priv->dev = hsr_dev;
+ INIT_LIST_HEAD(&hsr_priv->node_db);
+ INIT_LIST_HEAD(&hsr_priv->self_node_db);
+ for (i = 0; i < HSR_MAX_SLAVE; i++)
+ hsr_priv->slave[i] = slave[i];
+
+ spin_lock_init(&hsr_priv->seqnr_lock);
+ /* Overflow soon to find bugs easier: */
+ hsr_priv->sequence_nr = USHRT_MAX - 1024;
+
+ init_timer(&hsr_priv->announce_timer);
+ hsr_priv->announce_timer.function = hsr_announce;
+ hsr_priv->announce_timer.data = (unsigned long) hsr_priv;
+
+ memcpy(hsr_priv->sup_multicast_addr, def_multicast_addr, ETH_ALEN);
+ hsr_priv->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+
+/* FIXME: should I modify the value of these?
+ *
+ * - hsr_dev->flags - i.e.
+ * IFF_MASTER/SLAVE?
+ * - hsr_dev->priv_flags - i.e.
+ * IFF_EBRIDGE?
+ * IFF_TX_SKB_SHARING?
+ * IFF_HSR_MASTER/SLAVE?
+ */
+
+ for (i = 0; i < HSR_MAX_SLAVE; i++) {
+ res = check_slave_ok(slave[i]);
+ if (res)
+ return res;
+ }
+
+ hsr_dev->features = slave[0]->features & slave[1]->features;
+ /* Prevent recursive tx locking */
+ hsr_dev->features |= NETIF_F_LLTX;
+ /* VLAN on top of HSR needs testing and probably some work on
+ * hsr_header_create() etc.
+ */
+ hsr_dev->features |= NETIF_F_VLAN_CHALLENGED;
+
+ /* Set hsr_dev's MAC address to that of mac_slave1 */
+ memcpy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr, ETH_ALEN);
+
+ /* Set required header length */
+ for (i = 0; i < HSR_MAX_SLAVE; i++) {
+ if (slave[i]->hard_header_len + HSR_TAGLEN >
+ hsr_dev->hard_header_len)
+ hsr_dev->hard_header_len =
+ slave[i]->hard_header_len + HSR_TAGLEN;
+ }
+
+ /* MTU */
+ for (i = 0; i < HSR_MAX_SLAVE; i++)
+ if (slave[i]->mtu - HSR_TAGLEN < hsr_dev->mtu)
+ hsr_dev->mtu = slave[i]->mtu - HSR_TAGLEN;
+
+ /* Make sure the 1st call to netif_carrier_on() gets through */
+ netif_carrier_off(hsr_dev);
+
+ /* Promiscuity */
+ for (i = 0; i < HSR_MAX_SLAVE; i++) {
+ res = dev_set_promiscuity(slave[i], 1);
+ if (res) {
+ netdev_info(hsr_dev, "Cannot set slave promiscuity (%s, %d)\n",
+ slave[i]->name, res);
+ goto fail;
+ }
+ }
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ res = hsr_create_self_node(&hsr_priv->self_node_db,
+ hsr_dev->dev_addr,
+ hsr_priv->slave[1]->dev_addr);
+ if (res < 0)
+ goto fail;
+
+ res = register_netdevice(hsr_dev);
+ if (res)
+ goto fail;
+
+ register_hsr_master(hsr_priv);
+
+ return 0;
+
+fail:
+ restore_slaves(hsr_dev);
+ return res;
+}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
new file mode 100644
index 000000000000..2c7148e73914
--- /dev/null
+++ b/net/hsr/hsr_device.h
@@ -0,0 +1,29 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef __HSR_DEVICE_H
+#define __HSR_DEVICE_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_dev_setup(struct net_device *dev);
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ unsigned char multicast_spec);
+void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1,
+ struct net_device *slave2);
+void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1,
+ struct net_device *slave2);
+void hsr_check_announce(struct net_device *hsr_dev, int old_operstate);
+bool is_hsr_master(struct net_device *dev);
+int hsr_get_max_mtu(struct hsr_priv *hsr_priv);
+
+#endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
new file mode 100644
index 000000000000..003f5bb3acd2
--- /dev/null
+++ b/net/hsr/hsr_framereg.c
@@ -0,0 +1,503 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * The HSR spec says never to forward the same frame twice on the same
+ * interface. A frame is identified by its source MAC address and its HSR
+ * sequence number. This code keeps track of senders and their sequence numbers
+ * to allow filtering of duplicate frames, and to detect HSR ring errors.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+#include "hsr_netlink.h"
+
+
+struct node_entry {
+ struct list_head mac_list;
+ unsigned char MacAddressA[ETH_ALEN];
+ unsigned char MacAddressB[ETH_ALEN];
+ enum hsr_dev_idx AddrB_if; /* The local slave through which AddrB
+ * frames are received from this node
+ */
+ unsigned long time_in[HSR_MAX_SLAVE];
+ bool time_in_stale[HSR_MAX_SLAVE];
+ u16 seq_out[HSR_MAX_DEV];
+ struct rcu_head rcu_head;
+};
+
+/* TODO: use hash lists for mac addresses (linux/jhash.h)? */
+
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct node_entry *find_node_by_AddrA(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ struct node_entry *node;
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->MacAddressA, addr))
+ return node;
+ }
+
+ return NULL;
+}
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct node_entry *find_node_by_AddrB(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ struct node_entry *node;
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->MacAddressB, addr))
+ return node;
+ }
+
+ return NULL;
+}
+
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb)
+{
+ struct node_entry *node;
+ struct ethhdr *ethhdr;
+
+ if (!skb_mac_header_was_set(skb))
+ return NULL;
+
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->MacAddressA, ethhdr->h_source))
+ return node;
+ if (ether_addr_equal(node->MacAddressB, ethhdr->h_source))
+ return node;
+ }
+
+ return NULL;
+}
+
+
+/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize
+ * frames from self that's been looped over the HSR ring.
+ */
+int hsr_create_self_node(struct list_head *self_node_db,
+ unsigned char addr_a[ETH_ALEN],
+ unsigned char addr_b[ETH_ALEN])
+{
+ struct node_entry *node, *oldnode;
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ memcpy(node->MacAddressA, addr_a, ETH_ALEN);
+ memcpy(node->MacAddressB, addr_b, ETH_ALEN);
+
+ rcu_read_lock();
+ oldnode = list_first_or_null_rcu(self_node_db,
+ struct node_entry, mac_list);
+ if (oldnode) {
+ list_replace_rcu(&oldnode->mac_list, &node->mac_list);
+ rcu_read_unlock();
+ synchronize_rcu();
+ kfree(oldnode);
+ } else {
+ rcu_read_unlock();
+ list_add_tail_rcu(&node->mac_list, self_node_db);
+ }
+
+ return 0;
+}
+
+static void node_entry_reclaim(struct rcu_head *rh)
+{
+ kfree(container_of(rh, struct node_entry, rcu_head));
+}
+
+
+/* Add/merge node to the database of nodes. 'skb' must contain an HSR
+ * supervision frame.
+ * - If the supervision header's MacAddressA field is not yet in the database,
+ * this frame is from an hitherto unknown node - add it to the database.
+ * - If the sender's MAC address is not the same as its MacAddressA address,
+ * the node is using PICS_SUBS (address substitution). Record the sender's
+ * address as the node's MacAddressB.
+ *
+ * This function needs to work even if the sender node has changed one of its
+ * slaves' MAC addresses. In this case, there are four different cases described
+ * by (Addr-changed, received-from) pairs as follows. Note that changing the
+ * SlaveA address is equal to changing the node's own address:
+ *
+ * - (AddrB, SlaveB): The new AddrB will be recorded by PICS_SUBS code since
+ * node == NULL.
+ * - (AddrB, SlaveA): Will work as usual (the AddrB change won't be detected
+ * from this frame).
+ *
+ * - (AddrA, SlaveB): The old node will be found. We need to detect this and
+ * remove the node.
+ * - (AddrA, SlaveA): A new node will be registered (non-PICS_SUBS at first).
+ * The old one will be pruned after HSR_NODE_FORGET_TIME.
+ *
+ * We also need to detect if the sender's SlaveA and SlaveB cables have been
+ * swapped.
+ */
+struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv,
+ struct node_entry *node,
+ struct sk_buff *skb,
+ enum hsr_dev_idx dev_idx)
+{
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_ethhdr_sp *hsr_ethsup;
+ int i;
+ unsigned long now;
+
+ hsr_ethsup = (struct hsr_ethhdr_sp *) skb_mac_header(skb);
+ hsr_sp = (struct hsr_sup_payload *) skb->data;
+
+ if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) {
+ /* Node has changed its AddrA, frame was received from SlaveB */
+ list_del_rcu(&node->mac_list);
+ call_rcu(&node->rcu_head, node_entry_reclaim);
+ node = NULL;
+ }
+
+ if (node && (dev_idx == node->AddrB_if) &&
+ !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) {
+ /* Cables have been swapped */
+ list_del_rcu(&node->mac_list);
+ call_rcu(&node->rcu_head, node_entry_reclaim);
+ node = NULL;
+ }
+
+ if (node && (dev_idx != node->AddrB_if) &&
+ (node->AddrB_if != HSR_DEV_NONE) &&
+ !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) {
+ /* Cables have been swapped */
+ list_del_rcu(&node->mac_list);
+ call_rcu(&node->rcu_head, node_entry_reclaim);
+ node = NULL;
+ }
+
+ if (node)
+ return node;
+
+ node = find_node_by_AddrA(&hsr_priv->node_db, hsr_sp->MacAddressA);
+ if (node) {
+ /* Node is known, but frame was received from an unknown
+ * address. Node is PICS_SUBS capable; merge its AddrB.
+ */
+ memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN);
+ node->AddrB_if = dev_idx;
+ return node;
+ }
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC);
+ if (!node)
+ return NULL;
+
+ memcpy(node->MacAddressA, hsr_sp->MacAddressA, ETH_ALEN);
+ memcpy(node->MacAddressB, hsr_ethsup->ethhdr.h_source, ETH_ALEN);
+ if (!ether_addr_equal(hsr_sp->MacAddressA, hsr_ethsup->ethhdr.h_source))
+ node->AddrB_if = dev_idx;
+ else
+ node->AddrB_if = HSR_DEV_NONE;
+
+ /* We are only interested in time diffs here, so use current jiffies
+ * as initialization. (0 could trigger an spurious ring error warning).
+ */
+ now = jiffies;
+ for (i = 0; i < HSR_MAX_SLAVE; i++)
+ node->time_in[i] = now;
+ for (i = 0; i < HSR_MAX_DEV; i++)
+ node->seq_out[i] = ntohs(hsr_ethsup->hsr_sup.sequence_nr) - 1;
+
+ list_add_tail_rcu(&node->mac_list, &hsr_priv->node_db);
+
+ return node;
+}
+
+
+/* 'skb' is a frame meant for this host, that is to be passed to upper layers.
+ *
+ * If the frame was sent by a node's B interface, replace the sender
+ * address with that node's "official" address (MacAddressA) so that upper
+ * layers recognize where it came from.
+ */
+void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb)
+{
+ struct ethhdr *ethhdr;
+ struct node_entry *node;
+
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return;
+ }
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
+
+ rcu_read_lock();
+ node = find_node_by_AddrB(&hsr_priv->node_db, ethhdr->h_source);
+ if (node)
+ memcpy(ethhdr->h_source, node->MacAddressA, ETH_ALEN);
+ rcu_read_unlock();
+}
+
+
+/* 'skb' is a frame meant for another host.
+ * 'hsr_dev_idx' is the HSR index of the outgoing device
+ *
+ * Substitute the target (dest) MAC address if necessary, so the it matches the
+ * recipient interface MAC address, regardless of whether that is the
+ * recipient's A or B interface.
+ * This is needed to keep the packets flowing through switches that learn on
+ * which "side" the different interfaces are.
+ */
+void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr,
+ enum hsr_dev_idx dev_idx)
+{
+ struct node_entry *node;
+
+ rcu_read_lock();
+ node = find_node_by_AddrA(&hsr_priv->node_db, ethhdr->h_dest);
+ if (node && (node->AddrB_if == dev_idx))
+ memcpy(ethhdr->h_dest, node->MacAddressB, ETH_ALEN);
+ rcu_read_unlock();
+}
+
+
+/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
+ * false otherwise.
+ */
+static bool seq_nr_after(u16 a, u16 b)
+{
+ /* Remove inconsistency where
+ * seq_nr_after(a, b) == seq_nr_before(a, b) */
+ if ((int) b - a == 32768)
+ return false;
+
+ return (((s16) (b - a)) < 0);
+}
+#define seq_nr_before(a, b) seq_nr_after((b), (a))
+#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b)))
+#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
+
+
+void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx)
+{
+ if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) {
+ WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx);
+ return;
+ }
+ node->time_in[dev_idx] = jiffies;
+ node->time_in_stale[dev_idx] = false;
+}
+
+
+/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid
+ * ethhdr->h_source address and skb->mac_header set.
+ *
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise, or
+ * negative error code on error
+ */
+int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx,
+ struct sk_buff *skb)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+ u16 sequence_nr;
+
+ if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) {
+ WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx);
+ return -EINVAL;
+ }
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return -EINVAL;
+ }
+ hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+
+ sequence_nr = ntohs(hsr_ethhdr->hsr_tag.sequence_nr);
+ if (seq_nr_before_or_eq(sequence_nr, node->seq_out[dev_idx]))
+ return 1;
+
+ node->seq_out[dev_idx] = sequence_nr;
+ return 0;
+}
+
+
+
+static bool is_late(struct node_entry *node, enum hsr_dev_idx dev_idx)
+{
+ enum hsr_dev_idx other;
+
+ if (node->time_in_stale[dev_idx])
+ return true;
+
+ if (dev_idx == HSR_DEV_SLAVE_A)
+ other = HSR_DEV_SLAVE_B;
+ else
+ other = HSR_DEV_SLAVE_A;
+
+ if (node->time_in_stale[other])
+ return false;
+
+ if (time_after(node->time_in[other], node->time_in[dev_idx] +
+ msecs_to_jiffies(MAX_SLAVE_DIFF)))
+ return true;
+
+ return false;
+}
+
+
+/* Remove stale sequence_nr records. Called by timer every
+ * HSR_LIFE_CHECK_INTERVAL (two seconds or so).
+ */
+void hsr_prune_nodes(struct hsr_priv *hsr_priv)
+{
+ struct node_entry *node;
+ unsigned long timestamp;
+ unsigned long time_a, time_b;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(node, &hsr_priv->node_db, mac_list) {
+ /* Shorthand */
+ time_a = node->time_in[HSR_DEV_SLAVE_A];
+ time_b = node->time_in[HSR_DEV_SLAVE_B];
+
+ /* Check for timestamps old enough to risk wrap-around */
+ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2))
+ node->time_in_stale[HSR_DEV_SLAVE_A] = true;
+ if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2))
+ node->time_in_stale[HSR_DEV_SLAVE_B] = true;
+
+ /* Get age of newest frame from node.
+ * At least one time_in is OK here; nodes get pruned long
+ * before both time_ins can get stale
+ */
+ timestamp = time_a;
+ if (node->time_in_stale[HSR_DEV_SLAVE_A] ||
+ (!node->time_in_stale[HSR_DEV_SLAVE_B] &&
+ time_after(time_b, time_a)))
+ timestamp = time_b;
+
+ /* Warn of ring error only as long as we get frames at all */
+ if (time_is_after_jiffies(timestamp +
+ msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) {
+
+ if (is_late(node, HSR_DEV_SLAVE_A))
+ hsr_nl_ringerror(hsr_priv, node->MacAddressA,
+ HSR_DEV_SLAVE_A);
+ else if (is_late(node, HSR_DEV_SLAVE_B))
+ hsr_nl_ringerror(hsr_priv, node->MacAddressA,
+ HSR_DEV_SLAVE_B);
+ }
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr_priv, node->MacAddressA);
+ list_del_rcu(&node->mac_list);
+ /* Note that we need to free this entry later: */
+ call_rcu(&node->rcu_head, node_entry_reclaim);
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos,
+ unsigned char addr[ETH_ALEN])
+{
+ struct node_entry *node;
+
+ if (!_pos) {
+ node = list_first_or_null_rcu(&hsr_priv->node_db,
+ struct node_entry, mac_list);
+ if (node)
+ memcpy(addr, node->MacAddressA, ETH_ALEN);
+ return node;
+ }
+
+ node = _pos;
+ list_for_each_entry_continue_rcu(node, &hsr_priv->node_db, mac_list) {
+ memcpy(addr, node->MacAddressA, ETH_ALEN);
+ return node;
+ }
+
+ return NULL;
+}
+
+
+int hsr_get_node_data(struct hsr_priv *hsr_priv,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq)
+{
+ struct node_entry *node;
+ unsigned long tdiff;
+
+
+ rcu_read_lock();
+ node = find_node_by_AddrA(&hsr_priv->node_db, addr);
+ if (!node) {
+ rcu_read_unlock();
+ return -ENOENT; /* No such entry */
+ }
+
+ memcpy(addr_b, node->MacAddressB, ETH_ALEN);
+
+ tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_A];
+ if (node->time_in_stale[HSR_DEV_SLAVE_A])
+ *if1_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if1_age = INT_MAX;
+#endif
+ else
+ *if1_age = jiffies_to_msecs(tdiff);
+
+ tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_B];
+ if (node->time_in_stale[HSR_DEV_SLAVE_B])
+ *if2_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if2_age = INT_MAX;
+#endif
+ else
+ *if2_age = jiffies_to_msecs(tdiff);
+
+ /* Present sequence numbers as if they were incoming on interface */
+ *if1_seq = node->seq_out[HSR_DEV_SLAVE_B];
+ *if2_seq = node->seq_out[HSR_DEV_SLAVE_A];
+
+ if ((node->AddrB_if != HSR_DEV_NONE) && hsr_priv->slave[node->AddrB_if])
+ *addr_b_ifindex = hsr_priv->slave[node->AddrB_if]->ifindex;
+ else
+ *addr_b_ifindex = -1;
+
+ rcu_read_unlock();
+
+ return 0;
+}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
new file mode 100644
index 000000000000..e6c4022030ad
--- /dev/null
+++ b/net/hsr/hsr_framereg.h
@@ -0,0 +1,53 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef _HSR_FRAMEREG_H
+#define _HSR_FRAMEREG_H
+
+#include "hsr_main.h"
+
+struct node_entry;
+
+struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb);
+
+struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv,
+ struct node_entry *node,
+ struct sk_buff *skb,
+ enum hsr_dev_idx dev_idx);
+
+void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb);
+void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr,
+ enum hsr_dev_idx dev_idx);
+
+void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx);
+
+int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx,
+ struct sk_buff *skb);
+
+void hsr_prune_nodes(struct hsr_priv *hsr_priv);
+
+int hsr_create_self_node(struct list_head *self_node_db,
+ unsigned char addr_a[ETH_ALEN],
+ unsigned char addr_b[ETH_ALEN]);
+
+void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos,
+ unsigned char addr[ETH_ALEN]);
+
+int hsr_get_node_data(struct hsr_priv *hsr_priv,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq);
+
+#endif /* _HSR_FRAMEREG_H */
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
new file mode 100644
index 000000000000..af68dd83a4e3
--- /dev/null
+++ b/net/hsr/hsr_main.c
@@ -0,0 +1,469 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * In addition to routines for registering and unregistering HSR support, this
+ * file also contains the receive routine that handles all incoming frames with
+ * Ethertype (protocol) ETH_P_PRP (HSRv0), and network device event handling.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/timer.h>
+#include <linux/etherdevice.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_netlink.h"
+#include "hsr_framereg.h"
+
+
+/* List of all registered virtual HSR devices */
+static LIST_HEAD(hsr_list);
+
+void register_hsr_master(struct hsr_priv *hsr_priv)
+{
+ list_add_tail_rcu(&hsr_priv->hsr_list, &hsr_list);
+}
+
+void unregister_hsr_master(struct hsr_priv *hsr_priv)
+{
+ struct hsr_priv *hsr_priv_it;
+
+ list_for_each_entry(hsr_priv_it, &hsr_list, hsr_list)
+ if (hsr_priv_it == hsr_priv) {
+ list_del_rcu(&hsr_priv_it->hsr_list);
+ return;
+ }
+}
+
+bool is_hsr_slave(struct net_device *dev)
+{
+ struct hsr_priv *hsr_priv_it;
+
+ list_for_each_entry_rcu(hsr_priv_it, &hsr_list, hsr_list) {
+ if (dev == hsr_priv_it->slave[0])
+ return true;
+ if (dev == hsr_priv_it->slave[1])
+ return true;
+ }
+
+ return false;
+}
+
+
+/* If dev is a HSR slave device, return the virtual master device. Return NULL
+ * otherwise.
+ */
+static struct hsr_priv *get_hsr_master(struct net_device *dev)
+{
+ struct hsr_priv *hsr_priv;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list)
+ if ((dev == hsr_priv->slave[0]) ||
+ (dev == hsr_priv->slave[1])) {
+ rcu_read_unlock();
+ return hsr_priv;
+ }
+
+ rcu_read_unlock();
+ return NULL;
+}
+
+
+/* If dev is a HSR slave device, return the other slave device. Return NULL
+ * otherwise.
+ */
+static struct net_device *get_other_slave(struct hsr_priv *hsr_priv,
+ struct net_device *dev)
+{
+ if (dev == hsr_priv->slave[0])
+ return hsr_priv->slave[1];
+ if (dev == hsr_priv->slave[1])
+ return hsr_priv->slave[0];
+
+ return NULL;
+}
+
+
+static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *slave, *other_slave;
+ struct hsr_priv *hsr_priv;
+ int old_operstate;
+ int mtu_max;
+ int res;
+ struct net_device *dev;
+
+ dev = netdev_notifier_info_to_dev(ptr);
+
+ hsr_priv = get_hsr_master(dev);
+ if (hsr_priv) {
+ /* dev is a slave device */
+ slave = dev;
+ other_slave = get_other_slave(hsr_priv, slave);
+ } else {
+ if (!is_hsr_master(dev))
+ return NOTIFY_DONE;
+ hsr_priv = netdev_priv(dev);
+ slave = hsr_priv->slave[0];
+ other_slave = hsr_priv->slave[1];
+ }
+
+ switch (event) {
+ case NETDEV_UP: /* Administrative state DOWN */
+ case NETDEV_DOWN: /* Administrative state UP */
+ case NETDEV_CHANGE: /* Link (carrier) state changes */
+ old_operstate = hsr_priv->dev->operstate;
+ hsr_set_carrier(hsr_priv->dev, slave, other_slave);
+ /* netif_stacked_transfer_operstate() cannot be used here since
+ * it doesn't set IF_OPER_LOWERLAYERDOWN (?)
+ */
+ hsr_set_operstate(hsr_priv->dev, slave, other_slave);
+ hsr_check_announce(hsr_priv->dev, old_operstate);
+ break;
+ case NETDEV_CHANGEADDR:
+
+ /* This should not happen since there's no ndo_set_mac_address()
+ * for HSR devices - i.e. not supported.
+ */
+ if (dev == hsr_priv->dev)
+ break;
+
+ if (dev == hsr_priv->slave[0])
+ memcpy(hsr_priv->dev->dev_addr,
+ hsr_priv->slave[0]->dev_addr, ETH_ALEN);
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ res = hsr_create_self_node(&hsr_priv->self_node_db,
+ hsr_priv->dev->dev_addr,
+ hsr_priv->slave[1] ?
+ hsr_priv->slave[1]->dev_addr :
+ hsr_priv->dev->dev_addr);
+ if (res)
+ netdev_warn(hsr_priv->dev,
+ "Could not update HSR node address.\n");
+
+ if (dev == hsr_priv->slave[0])
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, hsr_priv->dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (dev == hsr_priv->dev)
+ break; /* Handled in ndo_change_mtu() */
+ mtu_max = hsr_get_max_mtu(hsr_priv);
+ if (hsr_priv->dev->mtu > mtu_max)
+ dev_set_mtu(hsr_priv->dev, mtu_max);
+ break;
+ case NETDEV_UNREGISTER:
+ if (dev == hsr_priv->slave[0])
+ hsr_priv->slave[0] = NULL;
+ if (dev == hsr_priv->slave[1])
+ hsr_priv->slave[1] = NULL;
+
+ /* There should really be a way to set a new slave device... */
+
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* HSR works only on Ethernet devices. Refuse slave to change
+ * its type.
+ */
+ return NOTIFY_BAD;
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+static struct timer_list prune_timer;
+
+static void prune_nodes_all(unsigned long data)
+{
+ struct hsr_priv *hsr_priv;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list)
+ hsr_prune_nodes(hsr_priv);
+ rcu_read_unlock();
+
+ prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
+ add_timer(&prune_timer);
+}
+
+
+static struct sk_buff *hsr_pull_tag(struct sk_buff *skb)
+{
+ struct hsr_tag *hsr_tag;
+ struct sk_buff *skb2;
+
+ skb2 = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb2))
+ goto err_free;
+ skb = skb2;
+
+ if (unlikely(!pskb_may_pull(skb, HSR_TAGLEN)))
+ goto err_free;
+
+ hsr_tag = (struct hsr_tag *) skb->data;
+ skb->protocol = hsr_tag->encap_proto;
+ skb_pull(skb, HSR_TAGLEN);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
+
+/* The uses I can see for these HSR supervision frames are:
+ * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type =
+ * 22") to reset any sequence_nr counters belonging to that node. Useful if
+ * the other node's counter has been reset for some reason.
+ * --
+ * Or not - resetting the counter and bridging the frame would create a
+ * loop, unfortunately.
+ *
+ * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck
+ * frame is received from a particular node, we know something is wrong.
+ * We just register these (as with normal frames) and throw them away.
+ *
+ * 3) Allow different MAC addresses for the two slave interfaces, using the
+ * MacAddressA field.
+ */
+static bool is_supervision_frame(struct hsr_priv *hsr_priv, struct sk_buff *skb)
+{
+ struct hsr_sup_tag *hsr_stag;
+
+ if (!ether_addr_equal(eth_hdr(skb)->h_dest,
+ hsr_priv->sup_multicast_addr))
+ return false;
+
+ hsr_stag = (struct hsr_sup_tag *) skb->data;
+ if (get_hsr_stag_path(hsr_stag) != 0x0f)
+ return false;
+ if ((hsr_stag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
+ (hsr_stag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+ return false;
+ if (hsr_stag->HSR_TLV_Length != 12)
+ return false;
+
+ return true;
+}
+
+
+/* Implementation somewhat according to IEC-62439-3, p. 43
+ */
+static int hsr_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct hsr_priv *hsr_priv;
+ struct net_device *other_slave;
+ struct node_entry *node;
+ bool deliver_to_self;
+ struct sk_buff *skb_deliver;
+ enum hsr_dev_idx dev_in_idx, dev_other_idx;
+ bool dup_out;
+ int ret;
+
+ hsr_priv = get_hsr_master(dev);
+
+ if (!hsr_priv) {
+ /* Non-HSR-slave device 'dev' is connected to a HSR network */
+ kfree_skb(skb);
+ dev->stats.rx_errors++;
+ return NET_RX_SUCCESS;
+ }
+
+ if (dev == hsr_priv->slave[0]) {
+ dev_in_idx = HSR_DEV_SLAVE_A;
+ dev_other_idx = HSR_DEV_SLAVE_B;
+ } else {
+ dev_in_idx = HSR_DEV_SLAVE_B;
+ dev_other_idx = HSR_DEV_SLAVE_A;
+ }
+
+ node = hsr_find_node(&hsr_priv->self_node_db, skb);
+ if (node) {
+ /* Always kill frames sent by ourselves */
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Is this frame a candidate for local reception? */
+ deliver_to_self = false;
+ if ((skb->pkt_type == PACKET_HOST) ||
+ (skb->pkt_type == PACKET_MULTICAST) ||
+ (skb->pkt_type == PACKET_BROADCAST))
+ deliver_to_self = true;
+ else if (ether_addr_equal(eth_hdr(skb)->h_dest,
+ hsr_priv->dev->dev_addr)) {
+ skb->pkt_type = PACKET_HOST;
+ deliver_to_self = true;
+ }
+
+
+ rcu_read_lock(); /* node_db */
+ node = hsr_find_node(&hsr_priv->node_db, skb);
+
+ if (is_supervision_frame(hsr_priv, skb)) {
+ skb_pull(skb, sizeof(struct hsr_sup_tag));
+ node = hsr_merge_node(hsr_priv, node, skb, dev_in_idx);
+ if (!node) {
+ rcu_read_unlock(); /* node_db */
+ kfree_skb(skb);
+ hsr_priv->dev->stats.rx_dropped++;
+ return NET_RX_DROP;
+ }
+ skb_push(skb, sizeof(struct hsr_sup_tag));
+ deliver_to_self = false;
+ }
+
+ if (!node) {
+ /* Source node unknown; this might be a HSR frame from
+ * another net (different multicast address). Ignore it.
+ */
+ rcu_read_unlock(); /* node_db */
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Register ALL incoming frames as outgoing through the other interface.
+ * This allows us to register frames as incoming only if they are valid
+ * for the receiving interface, without using a specific counter for
+ * incoming frames.
+ */
+ dup_out = hsr_register_frame_out(node, dev_other_idx, skb);
+ if (!dup_out)
+ hsr_register_frame_in(node, dev_in_idx);
+
+ /* Forward this frame? */
+ if (!dup_out && (skb->pkt_type != PACKET_HOST))
+ other_slave = get_other_slave(hsr_priv, dev);
+ else
+ other_slave = NULL;
+
+ if (hsr_register_frame_out(node, HSR_DEV_MASTER, skb))
+ deliver_to_self = false;
+
+ rcu_read_unlock(); /* node_db */
+
+ if (!deliver_to_self && !other_slave) {
+ kfree_skb(skb);
+ /* Circulated frame; silently remove it. */
+ return NET_RX_SUCCESS;
+ }
+
+ skb_deliver = skb;
+ if (deliver_to_self && other_slave) {
+ /* skb_clone() is not enough since we will strip the hsr tag
+ * and do address substitution below
+ */
+ skb_deliver = pskb_copy(skb, GFP_ATOMIC);
+ if (!skb_deliver) {
+ deliver_to_self = false;
+ hsr_priv->dev->stats.rx_dropped++;
+ }
+ }
+
+ if (deliver_to_self) {
+ bool multicast_frame;
+
+ skb_deliver = hsr_pull_tag(skb_deliver);
+ if (!skb_deliver) {
+ hsr_priv->dev->stats.rx_dropped++;
+ goto forward;
+ }
+#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+ /* Move everything in the header that is after the HSR tag,
+ * to work around alignment problems caused by the 6-byte HSR
+ * tag. In practice, this removes/overwrites the HSR tag in
+ * the header and restores a "standard" packet.
+ */
+ memmove(skb_deliver->data - HSR_TAGLEN, skb_deliver->data,
+ skb_headlen(skb_deliver));
+
+ /* Adjust skb members so they correspond with the move above.
+ * This cannot possibly underflow skb->data since hsr_pull_tag()
+ * above succeeded.
+ * At this point in the protocol stack, the transport and
+ * network headers have not been set yet, and we haven't touched
+ * the mac header nor the head. So we only need to adjust data
+ * and tail:
+ */
+ skb_deliver->data -= HSR_TAGLEN;
+ skb_deliver->tail -= HSR_TAGLEN;
+#endif
+ skb_deliver->dev = hsr_priv->dev;
+ hsr_addr_subst_source(hsr_priv, skb_deliver);
+ multicast_frame = (skb_deliver->pkt_type == PACKET_MULTICAST);
+ ret = netif_rx(skb_deliver);
+ if (ret == NET_RX_DROP) {
+ hsr_priv->dev->stats.rx_dropped++;
+ } else {
+ hsr_priv->dev->stats.rx_packets++;
+ hsr_priv->dev->stats.rx_bytes += skb->len;
+ if (multicast_frame)
+ hsr_priv->dev->stats.multicast++;
+ }
+ }
+
+forward:
+ if (other_slave) {
+ skb_push(skb, ETH_HLEN);
+ skb->dev = other_slave;
+ dev_queue_xmit(skb);
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+
+static struct packet_type hsr_pt __read_mostly = {
+ .type = htons(ETH_P_PRP),
+ .func = hsr_rcv,
+};
+
+static struct notifier_block hsr_nb = {
+ .notifier_call = hsr_netdev_notify, /* Slave event notifications */
+};
+
+
+static int __init hsr_init(void)
+{
+ int res;
+
+ BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_TAGLEN);
+
+ dev_add_pack(&hsr_pt);
+
+ init_timer(&prune_timer);
+ prune_timer.function = prune_nodes_all;
+ prune_timer.data = 0;
+ prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
+ add_timer(&prune_timer);
+
+ register_netdevice_notifier(&hsr_nb);
+
+ res = hsr_netlink_init();
+
+ return res;
+}
+
+static void __exit hsr_exit(void)
+{
+ unregister_netdevice_notifier(&hsr_nb);
+ del_timer(&prune_timer);
+ hsr_netlink_exit();
+ dev_remove_pack(&hsr_pt);
+}
+
+module_init(hsr_init);
+module_exit(hsr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
new file mode 100644
index 000000000000..56fe060c0ab1
--- /dev/null
+++ b/net/hsr/hsr_main.h
@@ -0,0 +1,166 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef _HSR_PRIVATE_H
+#define _HSR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+
+
+/* Time constants as specified in the HSR specification (IEC-62439-3 2010)
+ * Table 8.
+ * All values in milliseconds.
+ */
+#define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */
+#define HSR_NODE_FORGET_TIME 60000 /* ms */
+#define HSR_ANNOUNCE_INTERVAL 100 /* ms */
+
+
+/* By how much may slave1 and slave2 timestamps of latest received frame from
+ * each node differ before we notify of communication problem?
+ */
+#define MAX_SLAVE_DIFF 3000 /* ms */
+
+
+/* How often shall we check for broken ring and remove node entries older than
+ * HSR_NODE_FORGET_TIME?
+ */
+#define PRUNE_PERIOD 3000 /* ms */
+
+
+#define HSR_TLV_ANNOUNCE 22
+#define HSR_TLV_LIFE_CHECK 23
+
+
+/* HSR Tag.
+ * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB,
+ * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest,
+ * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr,
+ * encapsulated protocol } instead.
+ */
+#define HSR_TAGLEN 6
+
+/* Field names below as defined in the IEC:2010 standard for HSR. */
+struct hsr_tag {
+ __be16 path_and_LSDU_size;
+ __be16 sequence_nr;
+ __be16 encap_proto;
+} __packed;
+
+
+/* The helper functions below assumes that 'path' occupies the 4 most
+ * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
+ * equivalently, the 4 most significant bits of HSR tag byte 14).
+ *
+ * This is unclear in the IEC specification; its definition of MAC addresses
+ * indicates the spec is written with the least significant bit first (to the
+ * left). This, however, would mean that the LSDU field would be split in two
+ * with the path field in-between, which seems strange. I'm guessing the MAC
+ * address definition is in error.
+ */
+static inline u16 get_hsr_tag_path(struct hsr_tag *ht)
+{
+ return ntohs(ht->path_and_LSDU_size) >> 12;
+}
+
+static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht)
+{
+ return ntohs(ht->path_and_LSDU_size) & 0x0FFF;
+}
+
+static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path)
+{
+ ht->path_and_LSDU_size = htons(
+ (ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
+}
+
+static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size)
+{
+ ht->path_and_LSDU_size = htons(
+ (ntohs(ht->path_and_LSDU_size) & 0xF000) |
+ (LSDU_size & 0x0FFF));
+}
+
+struct hsr_ethhdr {
+ struct ethhdr ethhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+
+/* HSR Supervision Frame data types.
+ * Field names as defined in the IEC:2010 standard for HSR.
+ */
+struct hsr_sup_tag {
+ __be16 path_and_HSR_Ver;
+ __be16 sequence_nr;
+ __u8 HSR_TLV_Type;
+ __u8 HSR_TLV_Length;
+} __packed;
+
+struct hsr_sup_payload {
+ unsigned char MacAddressA[ETH_ALEN];
+} __packed;
+
+static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst)
+{
+ return get_hsr_tag_path((struct hsr_tag *) hst);
+}
+
+static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst)
+{
+ return get_hsr_tag_LSDU_size((struct hsr_tag *) hst);
+}
+
+static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path)
+{
+ set_hsr_tag_path((struct hsr_tag *) hst, path);
+}
+
+static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
+{
+ set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
+}
+
+struct hsr_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+
+enum hsr_dev_idx {
+ HSR_DEV_NONE = -1,
+ HSR_DEV_SLAVE_A = 0,
+ HSR_DEV_SLAVE_B,
+ HSR_DEV_MASTER,
+};
+#define HSR_MAX_SLAVE (HSR_DEV_SLAVE_B + 1)
+#define HSR_MAX_DEV (HSR_DEV_MASTER + 1)
+
+struct hsr_priv {
+ struct list_head hsr_list; /* List of hsr devices */
+ struct rcu_head rcu_head;
+ struct net_device *dev;
+ struct net_device *slave[HSR_MAX_SLAVE];
+ struct list_head node_db; /* Other HSR nodes */
+ struct list_head self_node_db; /* MACs of slaves */
+ struct timer_list announce_timer; /* Supervision frame dispatch */
+ int announce_count;
+ u16 sequence_nr;
+ spinlock_t seqnr_lock; /* locking for sequence_nr */
+ unsigned char sup_multicast_addr[ETH_ALEN];
+};
+
+void register_hsr_master(struct hsr_priv *hsr_priv);
+void unregister_hsr_master(struct hsr_priv *hsr_priv);
+bool is_hsr_slave(struct net_device *dev);
+
+#endif /* _HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
new file mode 100644
index 000000000000..4e66bf61f585
--- /dev/null
+++ b/net/hsr/hsr_netlink.c
@@ -0,0 +1,457 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ *
+ * Routines for handling Netlink messages for HSR.
+ */
+
+#include "hsr_netlink.h"
+#include <linux/kernel.h>
+#include <net/rtnetlink.h>
+#include <net/genetlink.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+
+static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
+ [IFLA_HSR_SLAVE1] = { .type = NLA_U32 },
+ [IFLA_HSR_SLAVE2] = { .type = NLA_U32 },
+ [IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 },
+};
+
+
+/* Here, it seems a netdevice has already been allocated for us, and the
+ * hsr_dev_setup routine has been executed. Nice!
+ */
+static int hsr_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_device *link[2];
+ unsigned char multicast_spec;
+
+ if (!data[IFLA_HSR_SLAVE1]) {
+ netdev_info(dev, "IFLA_HSR_SLAVE1 missing!\n");
+ return -EINVAL;
+ }
+ link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1]));
+ if (!data[IFLA_HSR_SLAVE2]) {
+ netdev_info(dev, "IFLA_HSR_SLAVE2 missing!\n");
+ return -EINVAL;
+ }
+ link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2]));
+
+ if (!link[0] || !link[1])
+ return -ENODEV;
+ if (link[0] == link[1])
+ return -EINVAL;
+
+ if (!data[IFLA_HSR_MULTICAST_SPEC])
+ multicast_spec = 0;
+ else
+ multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
+
+ return hsr_dev_finalize(dev, link, multicast_spec);
+}
+
+static struct rtnl_link_ops hsr_link_ops __read_mostly = {
+ .kind = "hsr",
+ .maxtype = IFLA_HSR_MAX,
+ .policy = hsr_policy,
+ .priv_size = sizeof(struct hsr_priv),
+ .setup = hsr_dev_setup,
+ .newlink = hsr_newlink,
+};
+
+
+
+/* attribute policy */
+/* NLA_BINARY missing in libnl; use NLA_UNSPEC in userspace instead. */
+static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
+ [HSR_A_NODE_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [HSR_A_NODE_ADDR_B] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [HSR_A_IFINDEX] = { .type = NLA_U32 },
+ [HSR_A_IF1_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF2_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF1_SEQ] = { .type = NLA_U16 },
+ [HSR_A_IF2_SEQ] = { .type = NLA_U16 },
+};
+
+static struct genl_family hsr_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "HSR",
+ .version = 1,
+ .maxattr = HSR_A_MAX,
+};
+
+static struct genl_multicast_group hsr_network_genl_mcgrp = {
+ .name = "hsr-network",
+};
+
+
+
+/* This is called if for some node with MAC address addr, we only get frames
+ * over one of the slave interfaces. This would indicate an open network ring
+ * (i.e. a link has failed somewhere).
+ */
+void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
+ enum hsr_dev_idx dev_idx)
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ int res;
+ int ifindex;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_RING_ERROR);
+ if (!msg_head)
+ goto nla_put_failure;
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ if (hsr_priv->slave[dev_idx])
+ ifindex = hsr_priv->slave[dev_idx]->ifindex;
+ else
+ ifindex = -1;
+ res = nla_put_u32(skb, HSR_A_IFINDEX, ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ netdev_warn(hsr_priv->dev, "Could not send HSR ring error message\n");
+}
+
+/* This is called when we haven't heard from the node with MAC address addr for
+ * some time (just before the node is removed from the node table/list).
+ */
+void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN])
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ int res;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_NODE_DOWN);
+ if (!msg_head)
+ goto nla_put_failure;
+
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ netdev_warn(hsr_priv->dev, "Could not send HSR node down\n");
+}
+
+
+/* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table
+ * about the status of a specific node in the network, defined by its MAC
+ * address.
+ *
+ * Input: hsr ifindex, node mac address
+ * Output: hsr ifindex, node mac address (copied from request),
+ * age of latest frame from node over slave 1, slave 2 [ms]
+ */
+static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
+{
+ /* For receiving */
+ struct nlattr *na;
+ struct net_device *hsr_dev;
+
+ /* For sending */
+ struct sk_buff *skb_out;
+ void *msg_head;
+ struct hsr_priv *hsr_priv;
+ unsigned char hsr_node_addr_b[ETH_ALEN];
+ int hsr_node_if1_age;
+ u16 hsr_node_if1_seq;
+ int hsr_node_if2_age;
+ u16 hsr_node_if2_seq;
+ int addr_b_ifindex;
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+ na = info->attrs[HSR_A_NODE_ADDR];
+ if (!na)
+ goto invalid;
+
+ hsr_dev = __dev_get_by_index(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto invalid;
+ if (!is_hsr_master(hsr_dev))
+ goto invalid;
+
+
+ /* Send reply */
+
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_STATUS);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ hsr_priv = netdev_priv(hsr_dev);
+ res = hsr_get_node_data(hsr_priv,
+ (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]),
+ hsr_node_addr_b,
+ &addr_b_ifindex,
+ &hsr_node_if1_age,
+ &hsr_node_if1_seq,
+ &hsr_node_if2_age,
+ &hsr_node_if2_seq);
+ if (res < 0)
+ goto fail;
+
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN,
+ nla_data(info->attrs[HSR_A_NODE_ADDR]));
+ if (res < 0)
+ goto nla_put_failure;
+
+ if (addr_b_ifindex > -1) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN,
+ hsr_node_addr_b);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, addr_b_ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IF1_AGE, hsr_node_if1_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ if (hsr_priv->slave[0])
+ res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
+ hsr_priv->slave[0]->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_IF2_AGE, hsr_node_if2_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ if (hsr_priv->slave[1])
+ res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
+ hsr_priv->slave[1]->ifindex);
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+ return 0;
+
+nla_put_failure:
+ kfree_skb(skb_out);
+ /* Fall through */
+
+fail:
+ return res;
+}
+
+static struct genl_ops hsr_ops_get_node_status = {
+ .cmd = HSR_C_GET_NODE_STATUS,
+ .flags = 0,
+ .policy = hsr_genl_policy,
+ .doit = hsr_get_node_status,
+ .dumpit = NULL,
+};
+
+
+/* Get a list of MacAddressA of all nodes known to this node (other than self).
+ */
+static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
+{
+ /* For receiving */
+ struct nlattr *na;
+ struct net_device *hsr_dev;
+
+ /* For sending */
+ struct sk_buff *skb_out;
+ void *msg_head;
+ struct hsr_priv *hsr_priv;
+ void *pos;
+ unsigned char addr[ETH_ALEN];
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+
+ hsr_dev = __dev_get_by_index(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto invalid;
+ if (!is_hsr_master(hsr_dev))
+ goto invalid;
+
+
+ /* Send reply */
+
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_LIST);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ hsr_priv = netdev_priv(hsr_dev);
+
+ rcu_read_lock();
+ pos = hsr_get_next_node(hsr_priv, NULL, addr);
+ while (pos) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ pos = hsr_get_next_node(hsr_priv, pos, addr);
+ }
+ rcu_read_unlock();
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+ return 0;
+
+nla_put_failure:
+ kfree_skb(skb_out);
+ /* Fall through */
+
+fail:
+ return res;
+}
+
+
+static struct genl_ops hsr_ops_get_node_list = {
+ .cmd = HSR_C_GET_NODE_LIST,
+ .flags = 0,
+ .policy = hsr_genl_policy,
+ .doit = hsr_get_node_list,
+ .dumpit = NULL,
+};
+
+int __init hsr_netlink_init(void)
+{
+ int rc;
+
+ rc = rtnl_link_register(&hsr_link_ops);
+ if (rc)
+ goto fail_rtnl_link_register;
+
+ rc = genl_register_family(&hsr_genl_family);
+ if (rc)
+ goto fail_genl_register_family;
+
+ rc = genl_register_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+ if (rc)
+ goto fail_genl_register_ops;
+
+ rc = genl_register_ops(&hsr_genl_family, &hsr_ops_get_node_list);
+ if (rc)
+ goto fail_genl_register_ops_node_list;
+
+ rc = genl_register_mc_group(&hsr_genl_family, &hsr_network_genl_mcgrp);
+ if (rc)
+ goto fail_genl_register_mc_group;
+
+ return 0;
+
+fail_genl_register_mc_group:
+ genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_list);
+fail_genl_register_ops_node_list:
+ genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+fail_genl_register_ops:
+ genl_unregister_family(&hsr_genl_family);
+fail_genl_register_family:
+ rtnl_link_unregister(&hsr_link_ops);
+fail_rtnl_link_register:
+
+ return rc;
+}
+
+void __exit hsr_netlink_exit(void)
+{
+ genl_unregister_mc_group(&hsr_genl_family, &hsr_network_genl_mcgrp);
+ genl_unregister_ops(&hsr_genl_family, &hsr_ops_get_node_status);
+ genl_unregister_family(&hsr_genl_family);
+
+ rtnl_link_unregister(&hsr_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("hsr");
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h
new file mode 100644
index 000000000000..d4579dcc3c7d
--- /dev/null
+++ b/net/hsr/hsr_netlink.h
@@ -0,0 +1,30 @@
+/* Copyright 2011-2013 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com
+ */
+
+#ifndef __HSR_NETLINK_H
+#define __HSR_NETLINK_H
+
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <uapi/linux/hsr_netlink.h>
+
+struct hsr_priv;
+
+int __init hsr_netlink_init(void);
+void __exit hsr_netlink_exit(void);
+
+void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
+ int dev_idx);
+void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]);
+void hsr_nl_framedrop(int dropcount, int dev_idx);
+void hsr_nl_linkdown(int dev_idx);
+
+#endif /* __HSR_NETLINK_H */
diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index fde90e63027d..9497c6f3276b 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -654,7 +654,9 @@ static int lowpan_header_create(struct sk_buff *skb,
head[1] = iphc1;
skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
memcpy(skb_push(skb, hc06_ptr - head), head, hc06_ptr - head);
+ skb_reset_network_header(skb);
lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data,
skb->len);
@@ -737,7 +739,6 @@ static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
return -ENOMEM;
skb_push(new, sizeof(struct ipv6hdr));
- skb_reset_network_header(new);
skb_copy_to_linear_data(new, hdr, sizeof(struct ipv6hdr));
new->protocol = htons(ETH_P_IPV6);
@@ -1059,7 +1060,6 @@ lowpan_process_data(struct sk_buff *skb)
skb = new;
skb_push(skb, sizeof(struct udphdr));
- skb_reset_transport_header(skb);
skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr));
lowpan_raw_dump_table(__func__, "raw UDP header dump",
@@ -1102,17 +1102,6 @@ static int lowpan_set_address(struct net_device *dev, void *p)
return 0;
}
-static int lowpan_get_mac_header_length(struct sk_buff *skb)
-{
- /*
- * Currently long addressing mode is supported only, so the overall
- * header size is 21:
- * FC SeqNum DPAN DA SA Sec
- * 2 + 1 + 2 + 8 + 8 + 0 = 21
- */
- return 21;
-}
-
static int
lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
int mlen, int plen, int offset, int type)
@@ -1133,12 +1122,15 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
frag->priority = skb->priority;
/* copy header, MFR and payload */
- memcpy(skb_put(frag, mlen), skb->data, mlen);
- memcpy(skb_put(frag, hlen), head, hlen);
+ skb_put(frag, mlen);
+ skb_copy_to_linear_data(frag, skb_mac_header(skb), mlen);
+
+ skb_put(frag, hlen);
+ skb_copy_to_linear_data_offset(frag, mlen, head, hlen);
- if (plen)
- skb_copy_from_linear_data_offset(skb, offset + mlen,
- skb_put(frag, plen), plen);
+ skb_put(frag, plen);
+ skb_copy_to_linear_data_offset(frag, mlen + hlen,
+ skb_network_header(skb) + offset, plen);
lowpan_raw_dump_table(__func__, " raw fragment dump", frag->data,
frag->len);
@@ -1152,7 +1144,7 @@ lowpan_skb_fragmentation(struct sk_buff *skb, struct net_device *dev)
int err, header_length, payload_length, tag, offset = 0;
u8 head[5];
- header_length = lowpan_get_mac_header_length(skb);
+ header_length = skb->mac_len;
payload_length = skb->len - header_length;
tag = lowpan_dev_info(dev)->fragment_tag++;
@@ -1323,8 +1315,6 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
/* Pull off the 1-byte of 6lowpan header. */
skb_pull(local_skb, 1);
- skb_reset_network_header(local_skb);
- skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
lowpan_give_skb_to_devices(local_skb);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 109ee89f123e..7785b28061ac 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct aead_givcrypt_request *req;
struct scatterlist *sg;
struct scatterlist *asg;
- struct esp_data *esp;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -139,8 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
/* skb is pure payload to encrypt */
- esp = x->data;
- aead = esp->aead;
+ aead = x->data;
alen = crypto_aead_authsize(aead);
tfclen = 0;
@@ -154,8 +152,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
- if (esp->padlen)
- clen = ALIGN(clen, esp->padlen);
plen = clen - skb->len - tfclen;
err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -280,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int elen = skb->len - hlen;
@@ -376,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -459,9 +453,8 @@ out:
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
- u32 align = max_t(u32, blksize, esp->padlen);
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
unsigned int net_adj;
switch (x->props.mode) {
@@ -476,8 +469,8 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
BUG();
}
- return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
- net_adj) & ~(align - 1)) + net_adj - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
static void esp4_err(struct sk_buff *skb, u32 info)
@@ -511,18 +504,16 @@ static void esp4_err(struct sk_buff *skb, u32 info)
static void esp_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_aead(esp->aead);
- kfree(esp);
+ crypto_free_aead(aead);
}
static int esp_init_aead(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
int err;
@@ -531,7 +522,7 @@ static int esp_init_aead(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
err = crypto_aead_setkey(aead, x->aead->alg_key,
(x->aead->alg_key_len + 7) / 8);
@@ -548,7 +539,6 @@ error:
static int esp_init_authenc(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
struct rtattr *rta;
@@ -583,7 +573,7 @@ static int esp_init_authenc(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
(x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -638,16 +628,11 @@ error:
static int esp_init_state(struct xfrm_state *x)
{
- struct esp_data *esp;
struct crypto_aead *aead;
u32 align;
int err;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
-
- x->data = esp;
+ x->data = NULL;
if (x->aead)
err = esp_init_aead(x);
@@ -657,9 +642,7 @@ static int esp_init_state(struct xfrm_state *x)
if (err)
goto error;
- aead = esp->aead;
-
- esp->padlen = 0;
+ aead = x->data;
x->props.header_len = sizeof(struct ip_esp_hdr) +
crypto_aead_ivsize(aead);
@@ -683,9 +666,7 @@ static int esp_init_state(struct xfrm_state *x)
}
align = ALIGN(crypto_aead_blocksize(aead), 4);
- if (esp->padlen)
- align = max_t(u32, align, esp->padlen);
- x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
return err;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85a4f21aac1a..59da7cde0724 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d23118d95ff9..718dfbd30cbe 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb,
addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a2e2b61cd7da..2510c02c2d21 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
@@ -57,15 +58,21 @@ struct clusterip_config {
struct rcu_head rcu;
};
-static LIST_HEAD(clusterip_configs);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
+ struct proc_dir_entry *procdir;
#endif
+};
static inline void
clusterip_config_get(struct clusterip_config *c)
@@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
list_del_rcu(&c->list);
- spin_unlock(&clusterip_lock);
+ spin_unlock(&cn->lock);
local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
@@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)
}
static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- list_for_each_entry_rcu(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
- c = __clusterip_config_find(clusterip);
+ c = __clusterip_config_find(net, clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
@@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
struct net_device *dev)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
@@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
- clusterip_procdir,
+ cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
kfree(c);
@@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- spin_lock_bh(&clusterip_lock);
- list_add_rcu(&c->list, &clusterip_configs);
- spin_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
return c;
}
@@ -370,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
@@ -384,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- dev = dev_get_by_name(&init_net, e->ip.iniface);
+ dev = dev_get_by_name(par->net, e->ip.iniface);
if (!dev) {
pr_info("no such interface %s\n",
e->ip.iniface);
@@ -492,6 +504,7 @@ arp_mangle(const struct nf_hook_ops *ops,
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
+ struct net *net = dev_net(in ? in : out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -508,7 +521,7 @@ arp_mangle(const struct nf_hook_ops *ops,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip, 0);
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -698,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
- ret = xt_register_target(&clusterip_tg_reg);
+ ret = register_pernet_subsys(&clusterip_net_ops);
if (ret < 0)
return ret;
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
ret = nf_register_hook(&cip_arp_ops);
if (ret < 0)
goto cleanup_target;
-#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
- if (!clusterip_procdir) {
- pr_err("Unable to proc dir entry\n");
- ret = -ENOMEM;
- goto cleanup_hook;
- }
-#endif /* CONFIG_PROC_FS */
-
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
+
return 0;
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
- nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
cleanup_target:
xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
- proc_remove(clusterip_procdir);
-#endif
+
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
rcu_barrier_bh();
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index cbc22158af49..9cb993cd224b 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net,
ub->qlen++;
pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net,
}
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
- else
- *(pm->prefix) = '\0';
if (in && in->hard_header_len > 0 &&
skb->mac_header != skb->network_header &&
@@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net,
if (in)
strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
- else
- pm->indev_name[0] = '\0';
if (out)
strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
- else
- pm->outdev_name[0] = '\0';
/* copy_len <= skb->len, so can't fail. */
if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 8f7536be1322..0f4cbfeb19bd 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -16,7 +16,6 @@
#include <net/netfilter/nf_tables.h>
#include <net/net_namespace.h>
#include <net/ip.h>
-#include <net/net_namespace.h>
#include <net/netfilter/nf_tables_ipv4.h>
static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d5b1390eebbe..3d69ec8dac57 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -701,13 +701,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
- .procname = "tcp_max_ssthresh",
- .data = &sysctl_tcp_max_ssthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c242440..821846fb0a7e 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,7 +140,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -149,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 019c2389a341..ad37bf18ae4b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -15,8 +15,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_max_ssthresh = 0;
-
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -299,35 +297,24 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-/*
- * Slow start is used when congestion window is less than slow start
- * threshold. This version implements the basic RFC2581 version
- * and optionally supports:
- * RFC3742 Limited Slow Start - growth limited to max_ssthresh
- * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- int cnt; /* increase in packets */
- unsigned int delta = 0;
- u32 snd_cwnd = tp->snd_cwnd;
-
- if (unlikely(!snd_cwnd)) {
- pr_err_once("snd_cwnd is nul, please report this bug.\n");
- snd_cwnd = 1U;
- }
+ u32 cwnd = tp->snd_cwnd + acked;
- if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
- cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
- else
- cnt = snd_cwnd; /* exponential increase */
-
- tp->snd_cwnd_cnt += cnt;
- while (tp->snd_cwnd_cnt >= snd_cwnd) {
- tp->snd_cwnd_cnt -= snd_cwnd;
- delta++;
- }
- tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
@@ -351,7 +338,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -360,7 +347,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
/* In dangerous area, increase slowly. */
else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b6ae92a51f58..828e4c3ffbaf 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,7 +304,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -315,7 +316,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 766032b4a6c3..f195d9316e55 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -8,7 +8,7 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly;
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 30f27f6b3655..8ed9305dfdf4 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
@@ -118,7 +118,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
*
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index c1a8175361e8..4a194acfd923 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,7 +227,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4d..478fe82611bf 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -85,7 +85,8 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -102,7 +103,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c871..8a520996f3d2 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -256,7 +256,8 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -270,7 +271,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In slow start */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
u32 delta;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b935397c703c..c53b7f35c51d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2903,7 +2903,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
if (seq_rtt < 0)
@@ -2918,20 +2919,25 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
{
struct tcp_sock *tp = tcp_sk(sk);
s32 seq_rtt = -1;
- if (tp->lsndtime && !tp->total_retrans)
- seq_rtt = tcp_time_stamp - tp->lsndtime;
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt = tcp_time_stamp - synack_stamp;
+
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3028,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
+ bool rtt_update;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3104,14 +3111,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
- (flag & FLAG_ACKED))
- tcp_rearm_rto(sk);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3150,6 +3156,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
+ } else if (skb && rtt_update && sack_rtt >= 0 &&
+ sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
}
#if FASTRETRANS_DEBUG > 0
@@ -3441,7 +3454,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight);
+ tcp_cong_avoid(sk, ack, acked, prior_in_flight);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -5626,6 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct request_sock *req;
int queued = 0;
bool acceptable;
+ u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5708,9 +5722,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
+ synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
@@ -5733,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, req);
+ tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 72f7218b03f5..991d62a2f9bb 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,12 +115,13 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
}
/**
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a7a5583eab04..a2b68a108eae 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -18,6 +18,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
@@ -104,13 +105,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
- /* {tcp|sock}_wfree() use exact truesize accounting :
- * sum(skb->truesize) MUST be exactly be gso_skb->truesize
- * So we account mss bytes of 'true size' for each segment.
- * The last segment will contain the remaining.
- */
- skb->truesize = mss;
- gso_skb->truesize -= mss;
+ sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
@@ -127,7 +122,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
- swap(gso_skb->truesize, skb->truesize);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8ce55b8aaec8..19ea6c2951f3 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,7 +15,8 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -23,7 +24,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 80fa2bfd7ede..06cae62bf208 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,13 +163,14 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
@@ -194,7 +195,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
} else {
u32 rtt, diff;
u64 target_cwnd;
@@ -243,7 +244,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
@@ -283,7 +284,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
}
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ac43cd747bce..326475a94865 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,13 +114,14 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
return;
}
@@ -133,7 +134,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked, in_flight);
} else {
u64 target_cwnd;
u32 rtt;
@@ -152,7 +153,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
if (veno->diff < beta) {
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 05c3b6f0e8e1..a347a078ee07 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,7 +69,8 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+ u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
@@ -78,7 +79,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
/* Scalable */
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index ccde54248c8c..e1a63930a967 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -104,10 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
const struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
- fl4->flowi4_oif = skb_dst(skb)->dev->ifindex;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
@@ -236,7 +240,7 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index e67e63f9858d..b8719df0366e 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -164,10 +164,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
u8 *iv;
u8 *tail;
__be32 *seqhi;
- struct esp_data *esp = x->data;
/* skb is pure payload to encrypt */
- aead = esp->aead;
+ aead = x->data;
alen = crypto_aead_authsize(aead);
tfclen = 0;
@@ -181,8 +180,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
}
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
- if (esp->padlen)
- clen = ALIGN(clen, esp->padlen);
plen = clen - skb->len - tfclen;
err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -271,8 +268,7 @@ error:
static int esp_input_done2(struct sk_buff *skb, int err)
{
struct xfrm_state *x = xfrm_input_state(skb);
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int elen = skb->len - hlen;
@@ -325,8 +321,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -414,9 +409,8 @@ out:
static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
- u32 align = max_t(u32, blksize, esp->padlen);
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
unsigned int net_adj;
if (x->props.mode != XFRM_MODE_TUNNEL)
@@ -424,8 +418,8 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
else
net_adj = 0;
- return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
- net_adj) & ~(align - 1)) + net_adj - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
@@ -454,18 +448,16 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
static void esp6_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_aead(esp->aead);
- kfree(esp);
+ crypto_free_aead(aead);
}
static int esp_init_aead(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
int err;
@@ -474,7 +466,7 @@ static int esp_init_aead(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
err = crypto_aead_setkey(aead, x->aead->alg_key,
(x->aead->alg_key_len + 7) / 8);
@@ -491,7 +483,6 @@ error:
static int esp_init_authenc(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
struct rtattr *rta;
@@ -526,7 +517,7 @@ static int esp_init_authenc(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
(x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -581,7 +572,6 @@ error:
static int esp6_init_state(struct xfrm_state *x)
{
- struct esp_data *esp;
struct crypto_aead *aead;
u32 align;
int err;
@@ -589,11 +579,7 @@ static int esp6_init_state(struct xfrm_state *x)
if (x->encap)
return -EINVAL;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
-
- x->data = esp;
+ x->data = NULL;
if (x->aead)
err = esp_init_aead(x);
@@ -603,9 +589,7 @@ static int esp6_init_state(struct xfrm_state *x)
if (err)
goto error;
- aead = esp->aead;
-
- esp->padlen = 0;
+ aead = x->data;
x->props.header_len = sizeof(struct ip_esp_hdr) +
crypto_aead_ivsize(aead);
@@ -625,9 +609,7 @@ static int esp6_init_state(struct xfrm_state *x)
}
align = ALIGN(crypto_aead_blocksize(aead), 4);
- if (esp->padlen)
- align = max_t(u32, align, esp->padlen);
- x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
return err;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 44400c216dc6..710238f58aa9 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -349,6 +349,11 @@ ip6t_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
cpu = smp_processor_id();
table_base = private->entries[cpu];
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 56eef30ee5f6..da00a2ecde55 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,7 +39,7 @@ MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
MODULE_LICENSE("GPL");
/* Send RST reply */
-static void send_reset(struct net *net, struct sk_buff *oldskb)
+static void send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
struct tcphdr otcph, *tcph;
@@ -88,8 +88,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
}
/* Check checksum. */
- if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
- skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
pr_debug("TCP checksum is invalid\n");
return;
}
@@ -227,7 +226,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
/* Do nothing */
break;
case IP6T_TCP_RESET:
- send_reset(net, skb);
+ send_reset(net, skb, par->hooknum);
break;
default:
net_info_ratelimited("case %u not handled yet\n", reject->with);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1ac0b6e17d95..fd399ac6c1f7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1087,10 +1087,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev)))
return NULL;
- if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
- return dst;
+ if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ return NULL;
- return NULL;
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return dst;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 08ed2772b7aa..5f8e128c512d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -135,10 +135,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
struct ipv6_opt_hdr *exthdr;
const unsigned char *nh = skb_network_header(skb);
u8 nexthdr = nh[IP6CB(skb)->nhoff];
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
- fl6->flowi6_oif = skb_dst(skb)->dev->ifindex;
+ fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
@@ -285,7 +289,7 @@ static struct dst_ops xfrm6_dst_ops = {
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c
index 2ca2f4dceab7..e24bcf977296 100644
--- a/net/mac802154/wpan.c
+++ b/net/mac802154/wpan.c
@@ -208,6 +208,8 @@ static int mac802154_header_create(struct sk_buff *skb,
head[1] = fc >> 8;
memcpy(skb_push(skb, pos), head, pos);
+ skb_reset_mac_header(skb);
+ skb->mac_len = pos;
return pos;
}
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index a13e15be7911..f2c7d83dc23f 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -198,13 +198,14 @@ mtype_list(const struct ip_set *set,
struct mtype *map = set->data;
struct nlattr *adt, *nested;
void *x;
- u32 id, first = cb->args[2];
+ u32 id, first = cb->args[IPSET_CB_ARG0];
adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!adt)
return -EMSGSIZE;
- for (; cb->args[2] < map->elements; cb->args[2]++) {
- id = cb->args[2];
+ for (; cb->args[IPSET_CB_ARG0] < map->elements;
+ cb->args[IPSET_CB_ARG0]++) {
+ id = cb->args[IPSET_CB_ARG0];
x = get_ext(set, map, id);
if (!test_bit(id, map->members) ||
(SET_WITH_TIMEOUT(set) &&
@@ -231,14 +232,14 @@ mtype_list(const struct ip_set *set,
ipset_nest_end(skb, adt);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(id == first)) {
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, adt);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index e7603c5b53d7..cf99676e69f8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -254,7 +254,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -ENOMEM;
map->elements = last_port - first_port + 1;
- map->memsize = map->elements * sizeof(unsigned long);
+ map->memsize = bitmap_bytes(0, map->elements);
set->variant = &bitmap_port;
set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_port(set, map, first_port, last_port)) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index dc9284bdd2dd..bac7e01df67f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,10 +1182,12 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
static int
ip_set_dump_done(struct netlink_callback *cb)
{
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
- if (cb->args[2]) {
- pr_debug("release set %s\n", nfnl_set(inst, cb->args[1])->name);
- __ip_set_put_byindex(inst, (ip_set_id_t) cb->args[1]);
+ struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ if (cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n",
+ nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name);
+ __ip_set_put_byindex(inst,
+ (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
}
return 0;
}
@@ -1203,7 +1205,7 @@ dump_attrs(struct nlmsghdr *nlh)
}
static int
-dump_init(struct netlink_callback *cb)
+dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
@@ -1211,15 +1213,15 @@ dump_init(struct netlink_callback *cb)
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
/* Second pass, so parser can't fail */
nla_parse(cda, IPSET_ATTR_CMD_MAX,
attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
- /* cb->args[0] : dump single set/all sets
- * [1] : set index
- * [..]: type specific
+ /* cb->args[IPSET_CB_NET]: net namespace
+ * [IPSET_CB_DUMP]: dump single set/all sets
+ * [IPSET_CB_INDEX]: set index
+ * [IPSET_CB_ARG0]: type specific
*/
if (cda[IPSET_ATTR_SETNAME]) {
@@ -1231,7 +1233,7 @@ dump_init(struct netlink_callback *cb)
return -ENOENT;
dump_type = DUMP_ONE;
- cb->args[1] = index;
+ cb->args[IPSET_CB_INDEX] = index;
} else
dump_type = DUMP_ALL;
@@ -1239,7 +1241,8 @@ dump_init(struct netlink_callback *cb)
u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
dump_type |= (f << 16);
}
- cb->args[0] = dump_type;
+ cb->args[IPSET_CB_NET] = (unsigned long)inst;
+ cb->args[IPSET_CB_DUMP] = dump_type;
return 0;
}
@@ -1251,12 +1254,12 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
struct ip_set *set = NULL;
struct nlmsghdr *nlh = NULL;
unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type, dump_flags;
int ret = 0;
- struct ip_set_net *inst = (struct ip_set_net *)cb->data;
- if (!cb->args[0]) {
- ret = dump_init(cb);
+ if (!cb->args[IPSET_CB_DUMP]) {
+ ret = dump_init(cb, inst);
if (ret < 0) {
nlh = nlmsg_hdr(cb->skb);
/* We have to create and send the error message
@@ -1267,17 +1270,18 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
}
}
- if (cb->args[1] >= inst->ip_set_max)
+ if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
goto out;
- dump_type = DUMP_TYPE(cb->args[0]);
- dump_flags = DUMP_FLAGS(cb->args[0]);
- max = dump_type == DUMP_ONE ? cb->args[1] + 1 : inst->ip_set_max;
+ dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]);
+ dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]);
+ max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1
+ : inst->ip_set_max;
dump_last:
- pr_debug("args[0]: %u %u args[1]: %ld\n",
- dump_type, dump_flags, cb->args[1]);
- for (; cb->args[1] < max; cb->args[1]++) {
- index = (ip_set_id_t) cb->args[1];
+ pr_debug("dump type, flag: %u %u index: %ld\n",
+ dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
+ for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
+ index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
set = nfnl_set(inst, index);
if (set == NULL) {
if (dump_type == DUMP_ONE) {
@@ -1294,7 +1298,7 @@ dump_last:
!!(set->type->features & IPSET_DUMP_LAST)))
continue;
pr_debug("List set: %s\n", set->name);
- if (!cb->args[2]) {
+ if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n");
__ip_set_get(set);
@@ -1311,7 +1315,7 @@ dump_last:
goto nla_put_failure;
if (dump_flags & IPSET_FLAG_LIST_SETNAME)
goto next_set;
- switch (cb->args[2]) {
+ switch (cb->args[IPSET_CB_ARG0]) {
case 0:
/* Core header data */
if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
@@ -1331,7 +1335,7 @@ dump_last:
read_lock_bh(&set->lock);
ret = set->variant->list(set, skb, cb);
read_unlock_bh(&set->lock);
- if (!cb->args[2])
+ if (!cb->args[IPSET_CB_ARG0])
/* Set is done, proceed with next one */
goto next_set;
goto release_refcount;
@@ -1340,8 +1344,8 @@ dump_last:
/* If we dump all sets, continue with dumping last ones */
if (dump_type == DUMP_ALL) {
dump_type = DUMP_LAST;
- cb->args[0] = dump_type | (dump_flags << 16);
- cb->args[1] = 0;
+ cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
+ cb->args[IPSET_CB_INDEX] = 0;
goto dump_last;
}
goto out;
@@ -1350,15 +1354,15 @@ nla_put_failure:
ret = -EFAULT;
next_set:
if (dump_type == DUMP_ONE)
- cb->args[1] = IPSET_INVALID_ID;
+ cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;
else
- cb->args[1]++;
+ cb->args[IPSET_CB_INDEX]++;
release_refcount:
/* If there was an error or set is done, release set */
- if (ret || !cb->args[2]) {
+ if (ret || !cb->args[IPSET_CB_ARG0]) {
pr_debug("release set %s\n", nfnl_set(inst, index)->name);
__ip_set_put_byindex(inst, index);
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
}
out:
if (nlh) {
@@ -1375,8 +1379,6 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
-
if (unlikely(protocol_failed(attr)))
return -IPSET_ERR_PROTOCOL;
@@ -1384,7 +1386,6 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
struct netlink_dump_control c = {
.dump = ip_set_dump_start,
.done = ip_set_dump_done,
- .data = (void *)inst
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
@@ -1961,7 +1962,6 @@ static int __net_init
ip_set_net_init(struct net *net)
{
struct ip_set_net *inst = ip_set_pernet(net);
-
struct ip_set **list;
inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6a80dbd30df7..be6932ad3a86 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -234,7 +234,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
#define mtype MTYPE
-#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
#define mtype_add IPSET_TOKEN(MTYPE, _add)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
@@ -931,7 +930,7 @@ mtype_list(const struct ip_set *set,
struct nlattr *atd, *nested;
const struct hbucket *n;
const struct mtype_elem *e;
- u32 first = cb->args[2];
+ u32 first = cb->args[IPSET_CB_ARG0];
/* We assume that one hash bucket fills into one page */
void *incomplete;
int i;
@@ -940,20 +939,22 @@ mtype_list(const struct ip_set *set,
if (!atd)
return -EMSGSIZE;
pr_debug("list hash set %s\n", set->name);
- for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
+ for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
+ cb->args[IPSET_CB_ARG0]++) {
incomplete = skb_tail_pointer(skb);
- n = hbucket(t, cb->args[2]);
- pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n);
+ n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ pr_debug("cb->arg bucket: %lu, t %p n %p\n",
+ cb->args[IPSET_CB_ARG0], t, n);
for (i = 0; i < n->pos; i++) {
e = ahash_data(n, i, set->dsize);
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
pr_debug("list hash %lu hbucket %p i %u, data %p\n",
- cb->args[2], n, i, e);
+ cb->args[IPSET_CB_ARG0], n, i, e);
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
- if (cb->args[2] == first) {
+ if (cb->args[IPSET_CB_ARG0] == first) {
nla_nest_cancel(skb, atd);
return -EMSGSIZE;
} else
@@ -968,16 +969,16 @@ mtype_list(const struct ip_set *set,
}
ipset_nest_end(skb, atd);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nlmsg_trim(skb, incomplete);
- if (unlikely(first == cb->args[2])) {
+ if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
pr_warning("Can't list set %s: one bucket does not fit into "
"a message. Please report it!\n", set->name);
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, atd);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 426032706ca9..2bc2dec20b00 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -137,12 +137,11 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet4_elem e = {
- .cidr[0] = h->nets[0].cidr[0] ? h->nets[0].cidr[0] : HOST_MASK,
- .cidr[1] = h->nets[0].cidr[1] ? h->nets[0].cidr[1] : HOST_MASK,
- };
+ struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -160,14 +159,14 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet4_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
u8 cidr, cidr2;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
@@ -364,12 +363,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet6_elem e = {
- .cidr[0] = h->nets[0].cidr[0] ? h->nets[0].cidr[0] : HOST_MASK,
- .cidr[1] = h->nets[0].cidr[1] ? h->nets[0].cidr[1] : HOST_MASK
- };
+ struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
@@ -386,11 +384,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netnet6_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 363fab933d48..703d1192a6a2 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -147,12 +147,11 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet4_elem e = {
- .cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
- .cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK),
- };
+ struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -174,8 +173,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet4_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
@@ -183,6 +181,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr, cidr2;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -419,12 +418,11 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet6_elem e = {
- .cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
- .cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK),
- };
+ struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
@@ -446,13 +444,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netportnet6_elem e = { .cidr[0] = HOST_MASK,
- .cidr[1] = HOST_MASK };
+ struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 port, port_to;
bool with_ports = false;
int ret;
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index ec6f6d15dded..3e2317f3cf68 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -490,14 +490,15 @@ list_set_list(const struct ip_set *set,
{
const struct list_set *map = set->data;
struct nlattr *atd, *nested;
- u32 i, first = cb->args[2];
+ u32 i, first = cb->args[IPSET_CB_ARG0];
const struct set_elem *e;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
- for (; cb->args[2] < map->size; cb->args[2]++) {
- i = cb->args[2];
+ for (; cb->args[IPSET_CB_ARG0] < map->size;
+ cb->args[IPSET_CB_ARG0]++) {
+ i = cb->args[IPSET_CB_ARG0];
e = list_set_elem(set, map, i);
if (e->id == IPSET_INVALID_ID)
goto finish;
@@ -522,13 +523,13 @@ list_set_list(const struct ip_set *set,
finish:
ipset_nest_end(skb, atd);
/* Set listing finished */
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return 0;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
- cb->args[2] = 0;
+ cb->args[IPSET_CB_ARG0] = 0;
return -EMSGSIZE;
}
ipset_nest_end(skb, atd);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a3df9bddc4f7..62786a495cea 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -704,7 +704,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
__ip_vs_dst_cache_reset(dest);
__ip_vs_svc_put(svc, false);
free_percpu(dest->stats.cpustats);
- kfree(dest);
+ ip_vs_dest_put_and_free(dest);
}
/*
@@ -3820,10 +3820,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- /* Some dest can be in grace period even before cleanup, we have to
- * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
- */
- rcu_barrier();
ip_vs_trash_cleanup(net);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
ip_vs_control_net_cleanup_sysctl(net);
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index eff13c94498e..ca056a331e60 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -136,7 +136,7 @@ static void ip_vs_lblc_rcu_free(struct rcu_head *head)
struct ip_vs_lblc_entry,
rcu_head);
- ip_vs_dest_put(en->dest);
+ ip_vs_dest_put_and_free(en->dest);
kfree(en);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 0b8550089a2e..3f21a2f47de1 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -130,7 +130,7 @@ static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
struct ip_vs_dest_set_elem *e;
e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
- ip_vs_dest_put(e->dest);
+ ip_vs_dest_put_and_free(e->dest);
kfree(e);
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 23e596e438b3..2f7ea7564044 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -20,13 +20,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
sctp_sctphdr_t *sh, _sctph;
sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (sh == NULL)
+ if (sh == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
sizeof(_schunkh), &_schunkh);
- if (sch == NULL)
+ if (sch == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
+
net = skb_net(skb);
ipvs = net_ipvs(net);
rcu_read_lock();
@@ -76,6 +81,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -87,19 +93,31 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->source = cp->vport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
@@ -110,6 +128,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -121,19 +140,32 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_in(cp, skb))
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->dest = cp->dport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 3588faebe529..cc65b2f42cd4 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -115,27 +115,46 @@ ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
}
-/* As ip_vs_sh_get, but with fallback if selected server is unavailable */
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
static inline struct ip_vs_dest *
ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
- unsigned int offset;
- unsigned int hash;
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
struct ip_vs_dest *dest;
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
- hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
dest = rcu_dereference(s->buckets[hash].dest);
if (!dest)
break;
- if (is_unavailable(dest))
- IP_VS_DBG_BUF(6, "SH: selected unavailable server "
- "%s:%d (offset %d)",
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port), offset);
- else
+ if (!is_unavailable(dest))
return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ ntohs(dest->port), roffset);
}
return NULL;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 2d3030ab5b61..a4b5e2a435ac 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -39,21 +39,23 @@ static struct ctl_table acct_sysctl_table[] = {
unsigned int
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
+ struct nf_conn_counter *counter;
acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
+ counter = acct->counter;
return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)atomic64_read(&acct[dir].packets),
- (unsigned long long)atomic64_read(&acct[dir].bytes));
+ (unsigned long long)atomic64_read(&counter[dir].packets),
+ (unsigned long long)atomic64_read(&counter[dir].bytes));
};
EXPORT_SYMBOL_GPL(seq_print_acct);
static struct nf_ct_ext_type acct_extend __read_mostly = {
- .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]),
- .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]),
+ .len = sizeof(struct nf_conn_acct),
+ .align = __alignof__(struct nf_conn_acct),
.id = NF_CT_EXT_ACCT,
};
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5d892febd64c..e22d950c60b3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1109,12 +1109,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
acct:
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(skb->len, &acct[CTINFO2DIR(ctinfo)].bytes);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
}
@@ -1126,13 +1128,15 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
int do_acct)
{
if (do_acct) {
- struct nf_conn_counter *acct;
+ struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
- atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets);
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
atomic64_add(skb->len - skb_network_offset(skb),
- &acct[CTINFO2DIR(ctinfo)].bytes);
+ &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index eea936b70d15..08870b859046 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -211,13 +211,23 @@ nla_put_failure:
}
static int
-dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes,
- enum ip_conntrack_dir dir)
+dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
+ enum ip_conntrack_dir dir, int type)
{
- enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nf_conn_counter *counter = acct->counter;
struct nlattr *nest_count;
+ u64 pkts, bytes;
- nest_count = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&counter[dir].packets, 0);
+ bytes = atomic64_xchg(&counter[dir].bytes, 0);
+ } else {
+ pkts = atomic64_read(&counter[dir].packets);
+ bytes = atomic64_read(&counter[dir].bytes);
+ }
+
+ nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
if (!nest_count)
goto nla_put_failure;
@@ -234,24 +244,19 @@ nla_put_failure:
}
static int
-ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
- enum ip_conntrack_dir dir, int type)
+ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
{
- struct nf_conn_counter *acct;
- u64 pkts, bytes;
+ struct nf_conn_acct *acct = nf_conn_acct_find(ct);
- acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
- if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
- pkts = atomic64_xchg(&acct[dir].packets, 0);
- bytes = atomic64_xchg(&acct[dir].bytes, 0);
- } else {
- pkts = atomic64_read(&acct[dir].packets);
- bytes = atomic64_read(&acct[dir].bytes);
- }
- return dump_counters(skb, pkts, bytes, dir);
+ if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0)
+ return -1;
+ if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
+ return -1;
+
+ return 0;
}
static int
@@ -488,8 +493,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (ctnetlink_dump_status(skb, ct) < 0 ||
ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 ||
+ ctnetlink_dump_acct(skb, ct, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
@@ -530,7 +534,7 @@ ctnetlink_proto_size(const struct nf_conn *ct)
}
static inline size_t
-ctnetlink_counters_size(const struct nf_conn *ct)
+ctnetlink_acct_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
@@ -579,7 +583,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
- + ctnetlink_counters_size(ct)
+ + ctnetlink_acct_size(ct)
+ ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
@@ -673,10 +677,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
- if (ctnetlink_dump_counters(skb, ct,
- IP_CT_DIR_ORIGINAL, type) < 0 ||
- ctnetlink_dump_counters(skb, ct,
- IP_CT_DIR_REPLY, type) < 0 ||
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 4811f762e060..a82667c64729 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -634,9 +634,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
static void nft_match_release(void)
{
- struct nft_xt *nft_match;
+ struct nft_xt *nft_match, *tmp;
- list_for_each_entry(nft_match, &nft_match_list, head)
+ list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
kfree(nft_match);
}
@@ -705,9 +705,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
static void nft_target_release(void)
{
- struct nft_xt *nft_target;
+ struct nft_xt *nft_target, *tmp;
- list_for_each_entry(nft_target, &nft_target_list, head)
+ list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
kfree(nft_target);
}
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index b0b87b2d2411..d3b1ffe26181 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -47,8 +47,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
if (priv->sreg_addr_min) {
if (priv->family == AF_INET) {
- range.min_addr.ip = data[priv->sreg_addr_min].data[0];
- range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+ range.min_addr.ip = (__force __be32)
+ data[priv->sreg_addr_min].data[0];
+ range.max_addr.ip = (__force __be32)
+ data[priv->sreg_addr_max].data[0];
} else {
memcpy(range.min_addr.ip6,
@@ -62,8 +64,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
}
if (priv->sreg_proto_min) {
- range.min_proto.all = data[priv->sreg_proto_min].data[0];
- range.max_proto.all = data[priv->sreg_proto_max].data[0];
+ range.min_proto.all = (__force __be16)
+ data[priv->sreg_proto_min].data[0];
+ range.max_proto.all = (__force __be16)
+ data[priv->sreg_proto_max].data[0];
range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8b03028cca69..227aa11e8409 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table,
return NULL;
}
- table->private = newinfo;
newinfo->initial_entries = private->initial_entries;
+ /*
+ * Ensure contents of newinfo are visible before assigning to
+ * private.
+ */
+ smp_wmb();
+ table->private = newinfo;
/*
* Even though table entries have now been swapped, other CPU's
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 1e2fae32f81b..ed00fef58996 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -147,6 +147,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_NFQ_info_v3 *info = par->targinfo;
u32 queue = info->queuenum;
+ int ret;
if (info->queues_total > 1) {
if (info->flags & NFQ_FLAG_CPU_FANOUT) {
@@ -157,7 +158,11 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
queue = nfqueue_hash(skb, par);
}
- return NF_QUEUE_NR(queue);
+ ret = NF_QUEUE_NR(queue);
+ if (info->flags & NFQ_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ return ret;
}
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index e595e07a759b..1e634615ab9d 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -26,16 +26,18 @@ connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
u_int64_t what = 0; /* initialize to make gcc happy */
u_int64_t bytes = 0;
u_int64_t pkts = 0;
+ const struct nf_conn_acct *acct;
const struct nf_conn_counter *counters;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return false;
- counters = nf_conn_acct_find(ct);
- if (!counters)
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
return false;
+ counters = acct->counter;
switch (sinfo->what) {
case XT_CONNBYTES_PKTS:
switch (sinfo->direction) {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3dd0e374bc2b..1ba67931eb1b 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,15 +35,6 @@
#include <net/netfilter/nf_conntrack.h>
#endif
-static void
-xt_socket_put_sk(struct sock *sk)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put(inet_twsk(sk));
- else
- sock_put(sk);
-}
-
static int
extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
@@ -216,7 +207,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
inet_twsk(sk)->tw_transparent));
if (sk != skb->sk)
- xt_socket_put_sk(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -381,7 +372,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
inet_twsk(sk)->tw_transparent));
if (sk != skb->sk)
- xt_socket_put_sk(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index ea36e99089af..3591cb5dae91 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -9,6 +9,8 @@ openvswitch-y := \
datapath.o \
dp_notify.o \
flow.o \
+ flow_netlink.o \
+ flow_table.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2aa13bd7f2b2..1408adc2a2a7 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,14 +55,10 @@
#include "datapath.h"
#include "flow.h"
+#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
-
-#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
-static void rehash_flow_table(struct work_struct *work);
-static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
-
int ovs_net_id __read_mostly;
static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
@@ -165,7 +161,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
- ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
+ ovs_flow_tbl_destroy(&dp->table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
@@ -225,6 +221,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
struct dp_stats_percpu *stats;
struct sw_flow_key key;
u64 *stats_counter;
+ u32 n_mask_hit;
int error;
stats = this_cpu_ptr(dp->stats_percpu);
@@ -237,7 +234,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
}
/* Look up flow. */
- flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
+ flow = ovs_flow_tbl_lookup(&dp->table, &key, &n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
@@ -262,6 +259,7 @@ out:
/* Update datapath statistics. */
u64_stats_update_begin(&stats->sync);
(*stats_counter)++;
+ stats->n_mask_hit += n_mask_hit;
u64_stats_update_end(&stats->sync);
}
@@ -435,7 +433,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
- ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
+ ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb);
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
@@ -455,398 +453,6 @@ out:
return err;
}
-/* Called with ovs_mutex. */
-static int flush_flows(struct datapath *dp)
-{
- struct flow_table *old_table;
- struct flow_table *new_table;
-
- old_table = ovsl_dereference(dp->table);
- new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
- if (!new_table)
- return -ENOMEM;
-
- rcu_assign_pointer(dp->table, new_table);
-
- ovs_flow_tbl_destroy(old_table, true);
- return 0;
-}
-
-static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len)
-{
-
- struct sw_flow_actions *acts;
- int new_acts_size;
- int req_size = NLA_ALIGN(attr_len);
- int next_offset = offsetof(struct sw_flow_actions, actions) +
- (*sfa)->actions_len;
-
- if (req_size <= (ksize(*sfa) - next_offset))
- goto out;
-
- new_acts_size = ksize(*sfa) * 2;
-
- if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
- return ERR_PTR(-EMSGSIZE);
- new_acts_size = MAX_ACTIONS_BUFSIZE;
- }
-
- acts = ovs_flow_actions_alloc(new_acts_size);
- if (IS_ERR(acts))
- return (void *)acts;
-
- memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
- acts->actions_len = (*sfa)->actions_len;
- kfree(*sfa);
- *sfa = acts;
-
-out:
- (*sfa)->actions_len += req_size;
- return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
-}
-
-static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
-{
- struct nlattr *a;
-
- a = reserve_sfa_size(sfa, nla_attr_size(len));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- a->nla_type = attrtype;
- a->nla_len = nla_attr_size(len);
-
- if (data)
- memcpy(nla_data(a), data, len);
- memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
-
- return 0;
-}
-
-static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype)
-{
- int used = (*sfa)->actions_len;
- int err;
-
- err = add_action(sfa, attrtype, NULL, 0);
- if (err)
- return err;
-
- return used;
-}
-
-static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset)
-{
- struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset);
-
- a->nla_len = sfa->actions_len - st_offset;
-}
-
-static int validate_and_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa);
-
-static int validate_and_copy_sample(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa)
-{
- const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
- const struct nlattr *probability, *actions;
- const struct nlattr *a;
- int rem, start, err, st_acts;
-
- memset(attrs, 0, sizeof(attrs));
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
- return -EINVAL;
- attrs[type] = a;
- }
- if (rem)
- return -EINVAL;
-
- probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
- if (!probability || nla_len(probability) != sizeof(u32))
- return -EINVAL;
-
- actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
- if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
- return -EINVAL;
-
- /* validation done, copy sample action. */
- start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
- if (start < 0)
- return start;
- err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32));
- if (err)
- return err;
- st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
- if (st_acts < 0)
- return st_acts;
-
- err = validate_and_copy_actions(actions, key, depth + 1, sfa);
- if (err)
- return err;
-
- add_nested_action_end(*sfa, st_acts);
- add_nested_action_end(*sfa, start);
-
- return 0;
-}
-
-static int validate_tp_port(const struct sw_flow_key *flow_key)
-{
- if (flow_key->eth.type == htons(ETH_P_IP)) {
- if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
- return 0;
- } else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
- if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
- return 0;
- }
-
- return -EINVAL;
-}
-
-static int validate_and_copy_set_tun(const struct nlattr *attr,
- struct sw_flow_actions **sfa)
-{
- struct sw_flow_match match;
- struct sw_flow_key key;
- int err, start;
-
- ovs_match_init(&match, &key, NULL);
- err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
- if (err)
- return err;
-
- start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
- if (start < 0)
- return start;
-
- err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
- sizeof(match.key->tun_key));
- add_nested_action_end(*sfa, start);
-
- return err;
-}
-
-static int validate_set(const struct nlattr *a,
- const struct sw_flow_key *flow_key,
- struct sw_flow_actions **sfa,
- bool *set_tun)
-{
- const struct nlattr *ovs_key = nla_data(a);
- int key_type = nla_type(ovs_key);
-
- /* There can be only one key in a action */
- if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
- return -EINVAL;
-
- if (key_type > OVS_KEY_ATTR_MAX ||
- (ovs_key_lens[key_type] != nla_len(ovs_key) &&
- ovs_key_lens[key_type] != -1))
- return -EINVAL;
-
- switch (key_type) {
- const struct ovs_key_ipv4 *ipv4_key;
- const struct ovs_key_ipv6 *ipv6_key;
- int err;
-
- case OVS_KEY_ATTR_PRIORITY:
- case OVS_KEY_ATTR_SKB_MARK:
- case OVS_KEY_ATTR_ETHERNET:
- break;
-
- case OVS_KEY_ATTR_TUNNEL:
- *set_tun = true;
- err = validate_and_copy_set_tun(a, sfa);
- if (err)
- return err;
- break;
-
- case OVS_KEY_ATTR_IPV4:
- if (flow_key->eth.type != htons(ETH_P_IP))
- return -EINVAL;
-
- if (!flow_key->ip.proto)
- return -EINVAL;
-
- ipv4_key = nla_data(ovs_key);
- if (ipv4_key->ipv4_proto != flow_key->ip.proto)
- return -EINVAL;
-
- if (ipv4_key->ipv4_frag != flow_key->ip.frag)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_IPV6:
- if (flow_key->eth.type != htons(ETH_P_IPV6))
- return -EINVAL;
-
- if (!flow_key->ip.proto)
- return -EINVAL;
-
- ipv6_key = nla_data(ovs_key);
- if (ipv6_key->ipv6_proto != flow_key->ip.proto)
- return -EINVAL;
-
- if (ipv6_key->ipv6_frag != flow_key->ip.frag)
- return -EINVAL;
-
- if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
- return -EINVAL;
-
- break;
-
- case OVS_KEY_ATTR_TCP:
- if (flow_key->ip.proto != IPPROTO_TCP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- case OVS_KEY_ATTR_UDP:
- if (flow_key->ip.proto != IPPROTO_UDP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- case OVS_KEY_ATTR_SCTP:
- if (flow_key->ip.proto != IPPROTO_SCTP)
- return -EINVAL;
-
- return validate_tp_port(flow_key);
-
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int validate_userspace(const struct nlattr *attr)
-{
- static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
- [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
- [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
- };
- struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
- int error;
-
- error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
- attr, userspace_policy);
- if (error)
- return error;
-
- if (!a[OVS_USERSPACE_ATTR_PID] ||
- !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
- return -EINVAL;
-
- return 0;
-}
-
-static int copy_action(const struct nlattr *from,
- struct sw_flow_actions **sfa)
-{
- int totlen = NLA_ALIGN(from->nla_len);
- struct nlattr *to;
-
- to = reserve_sfa_size(sfa, from->nla_len);
- if (IS_ERR(to))
- return PTR_ERR(to);
-
- memcpy(to, from, totlen);
- return 0;
-}
-
-static int validate_and_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key,
- int depth,
- struct sw_flow_actions **sfa)
-{
- const struct nlattr *a;
- int rem, err;
-
- if (depth >= SAMPLE_ACTION_DEPTH)
- return -EOVERFLOW;
-
- nla_for_each_nested(a, attr, rem) {
- /* Expected argument lengths, (u32)-1 for variable length. */
- static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
- [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
- [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
- [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
- [OVS_ACTION_ATTR_POP_VLAN] = 0,
- [OVS_ACTION_ATTR_SET] = (u32)-1,
- [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
- };
- const struct ovs_action_push_vlan *vlan;
- int type = nla_type(a);
- bool skip_copy;
-
- if (type > OVS_ACTION_ATTR_MAX ||
- (action_lens[type] != nla_len(a) &&
- action_lens[type] != (u32)-1))
- return -EINVAL;
-
- skip_copy = false;
- switch (type) {
- case OVS_ACTION_ATTR_UNSPEC:
- return -EINVAL;
-
- case OVS_ACTION_ATTR_USERSPACE:
- err = validate_userspace(a);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_OUTPUT:
- if (nla_get_u32(a) >= DP_MAX_PORTS)
- return -EINVAL;
- break;
-
-
- case OVS_ACTION_ATTR_POP_VLAN:
- break;
-
- case OVS_ACTION_ATTR_PUSH_VLAN:
- vlan = nla_data(a);
- if (vlan->vlan_tpid != htons(ETH_P_8021Q))
- return -EINVAL;
- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
- return -EINVAL;
- break;
-
- case OVS_ACTION_ATTR_SET:
- err = validate_set(a, key, sfa, &skip_copy);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(a, key, depth, sfa);
- if (err)
- return err;
- skip_copy = true;
- break;
-
- default:
- return -EINVAL;
- }
- if (!skip_copy) {
- err = copy_action(a, sfa);
- if (err)
- return err;
- }
- }
-
- if (rem > 0)
- return -EINVAL;
-
- return 0;
-}
-
static void clear_stats(struct sw_flow *flow)
{
flow->used = 0;
@@ -902,15 +508,16 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_flow_free;
- err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
+ err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
- acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
err = PTR_ERR(acts);
if (IS_ERR(acts))
goto err_flow_free;
- err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
+ err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
+ &flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
@@ -958,15 +565,18 @@ static struct genl_ops dp_packet_genl_ops[] = {
}
};
-static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
+ struct ovs_dp_megaflow_stats *mega_stats)
{
- struct flow_table *table;
int i;
- table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held());
- stats->n_flows = ovs_flow_tbl_count(table);
+ memset(mega_stats, 0, sizeof(*mega_stats));
+
+ stats->n_flows = ovs_flow_tbl_count(&dp->table);
+ mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
stats->n_hit = stats->n_missed = stats->n_lost = 0;
+
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
@@ -982,6 +592,7 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
+ mega_stats->n_mask_hit += local_stats.n_mask_hit;
}
}
@@ -1005,100 +616,6 @@ static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
-static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb);
-static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
-{
- const struct nlattr *a;
- struct nlattr *start;
- int err = 0, rem;
-
- start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
- if (!start)
- return -EMSGSIZE;
-
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- struct nlattr *st_sample;
-
- switch (type) {
- case OVS_SAMPLE_ATTR_PROBABILITY:
- if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a)))
- return -EMSGSIZE;
- break;
- case OVS_SAMPLE_ATTR_ACTIONS:
- st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
- if (!st_sample)
- return -EMSGSIZE;
- err = actions_to_attr(nla_data(a), nla_len(a), skb);
- if (err)
- return err;
- nla_nest_end(skb, st_sample);
- break;
- }
- }
-
- nla_nest_end(skb, start);
- return err;
-}
-
-static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
-{
- const struct nlattr *ovs_key = nla_data(a);
- int key_type = nla_type(ovs_key);
- struct nlattr *start;
- int err;
-
- switch (key_type) {
- case OVS_KEY_ATTR_IPV4_TUNNEL:
- start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
- if (!start)
- return -EMSGSIZE;
-
- err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
- nla_data(ovs_key));
- if (err)
- return err;
- nla_nest_end(skb, start);
- break;
- default:
- if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
- return -EMSGSIZE;
- break;
- }
-
- return 0;
-}
-
-static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb)
-{
- const struct nlattr *a;
- int rem, err;
-
- nla_for_each_attr(a, attr, len, rem) {
- int type = nla_type(a);
-
- switch (type) {
- case OVS_ACTION_ATTR_SET:
- err = set_action_to_attr(a, skb);
- if (err)
- return err;
- break;
-
- case OVS_ACTION_ATTR_SAMPLE:
- err = sample_action_to_attr(a, skb);
- if (err)
- return err;
- break;
- default:
- if (nla_put(skb, type, nla_len(a), nla_data(a)))
- return -EMSGSIZE;
- break;
- }
- }
-
- return 0;
-}
-
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
@@ -1135,8 +652,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->unmasked_key,
- &flow->unmasked_key, skb);
+ err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
if (err)
goto error;
nla_nest_end(skb, nla);
@@ -1145,7 +661,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
+ err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
if (err)
goto error;
@@ -1155,7 +671,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
used = flow->used;
stats.n_packets = flow->packet_count;
stats.n_bytes = flow->byte_count;
- tcp_flags = flow->tcp_flags;
+ tcp_flags = (u8)ntohs(flow->tcp_flags);
spin_unlock_bh(&flow->lock);
if (used &&
@@ -1188,7 +704,8 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
sf_acts = rcu_dereference_check(flow->sf_acts,
lockdep_ovsl_is_held());
- err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb);
+ err = ovs_nla_put_actions(sf_acts->actions,
+ sf_acts->actions_len, skb);
if (!err)
nla_nest_end(skb, start);
else {
@@ -1234,6 +751,14 @@ static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
return skb;
}
+static struct sw_flow *__ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key)
+{
+ u32 __always_unused n_mask_hit;
+
+ return ovs_flow_tbl_lookup(tbl, key, &n_mask_hit);
+}
+
static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
@@ -1243,7 +768,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_actions *acts = NULL;
struct sw_flow_match match;
int error;
@@ -1254,21 +778,21 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
goto error;
ovs_match_init(&match, &key, &mask);
- error = ovs_match_from_nlattrs(&match,
- a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
+ error = ovs_nla_get_match(&match,
+ a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
- acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
+ acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
error = PTR_ERR(acts);
if (IS_ERR(acts))
goto error;
- ovs_flow_key_mask(&masked_key, &key, &mask);
- error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
- &masked_key, 0, &acts);
+ ovs_flow_mask_key(&masked_key, &key, &mask);
+ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+ &masked_key, 0, &acts);
if (error) {
OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
goto err_kfree;
@@ -1284,29 +808,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (!dp)
goto err_unlock_ovs;
- table = ovsl_dereference(dp->table);
-
/* Check if this is a duplicate flow */
- flow = ovs_flow_lookup(table, &key);
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
if (!flow) {
- struct sw_flow_mask *mask_p;
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
goto err_unlock_ovs;
- /* Expand table, if necessary, to make room. */
- if (ovs_flow_tbl_need_to_expand(table)) {
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_expand(table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_destroy(table, true);
- table = ovsl_dereference(dp->table);
- }
- }
-
/* Allocate flow. */
flow = ovs_flow_alloc();
if (IS_ERR(flow)) {
@@ -1317,25 +826,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
flow->key = masked_key;
flow->unmasked_key = key;
-
- /* Make sure mask is unique in the system */
- mask_p = ovs_sw_flow_mask_find(table, &mask);
- if (!mask_p) {
- /* Allocate a new mask if none exsits. */
- mask_p = ovs_sw_flow_mask_alloc();
- if (!mask_p)
- goto err_flow_free;
- mask_p->key = mask.key;
- mask_p->range = mask.range;
- ovs_sw_flow_mask_insert(table, mask_p);
- }
-
- ovs_sw_flow_mask_add_ref(mask_p);
- flow->mask = mask_p;
rcu_assign_pointer(flow->sf_acts, acts);
/* Put flow in bucket. */
- ovs_flow_insert(table, flow);
+ error = ovs_flow_tbl_insert(&dp->table, flow, &mask);
+ if (error) {
+ acts = NULL;
+ goto err_flow_free;
+ }
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq, OVS_FLOW_CMD_NEW);
@@ -1356,7 +854,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* The unmasked key has to be the same for flow updates. */
error = -EINVAL;
- if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
+ if (!ovs_flow_cmp_unmasked_key(flow, &match)) {
OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
goto err_unlock_ovs;
}
@@ -1364,7 +862,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
- ovs_flow_deferred_free_acts(old_acts);
+ ovs_nla_free_flow_actions(old_acts);
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq, OVS_FLOW_CMD_NEW);
@@ -1403,7 +901,6 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_match match;
int err;
@@ -1413,7 +910,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
}
ovs_match_init(&match, &key, NULL);
- err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
return err;
@@ -1424,9 +921,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
- table = ovsl_dereference(dp->table);
- flow = ovs_flow_lookup_unmasked_key(table, &match);
- if (!flow) {
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
+ if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
err = -ENOENT;
goto unlock;
}
@@ -1453,7 +949,6 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
- struct flow_table *table;
struct sw_flow_match match;
int err;
@@ -1465,18 +960,17 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
}
if (!a[OVS_FLOW_ATTR_KEY]) {
- err = flush_flows(dp);
+ err = ovs_flow_tbl_flush(&dp->table);
goto unlock;
}
ovs_match_init(&match, &key, NULL);
- err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
goto unlock;
- table = ovsl_dereference(dp->table);
- flow = ovs_flow_lookup_unmasked_key(table, &match);
- if (!flow) {
+ flow = __ovs_flow_tbl_lookup(&dp->table, &key);
+ if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
err = -ENOENT;
goto unlock;
}
@@ -1487,7 +981,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
- ovs_flow_remove(table, flow);
+ ovs_flow_tbl_remove(&dp->table, flow);
err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_FLOW_CMD_DEL);
@@ -1506,8 +1000,8 @@ unlock:
static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct table_instance *ti;
struct datapath *dp;
- struct flow_table *table;
rcu_read_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -1516,14 +1010,14 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
return -ENODEV;
}
- table = rcu_dereference(dp->table);
+ ti = rcu_dereference(dp->table.ti);
for (;;) {
struct sw_flow *flow;
u32 bucket, obj;
bucket = cb->args[0];
obj = cb->args[1];
- flow = ovs_flow_dump_next(table, &bucket, &obj);
+ flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
if (!flow)
break;
@@ -1589,6 +1083,7 @@ static size_t ovs_dp_cmd_msg_size(void)
msgsize += nla_total_size(IFNAMSIZ);
msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
+ msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
return msgsize;
}
@@ -1598,6 +1093,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
{
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
+ struct ovs_dp_megaflow_stats dp_megaflow_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
@@ -1613,8 +1109,14 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
if (err)
goto nla_put_failure;
- get_dp_stats(dp, &dp_stats);
- if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats))
+ get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
+ if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+ &dp_stats))
+ goto nla_put_failure;
+
+ if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+ sizeof(struct ovs_dp_megaflow_stats),
+ &dp_megaflow_stats))
goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
@@ -1687,9 +1189,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
/* Allocate table. */
- err = -ENOMEM;
- rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
- if (!dp->table)
+ err = ovs_flow_tbl_init(&dp->table);
+ if (err)
goto err_free_dp;
dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
@@ -1699,7 +1200,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!dp->ports) {
err = -ENOMEM;
goto err_destroy_percpu;
@@ -1746,7 +1247,7 @@ err_destroy_ports_array:
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
+ ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
release_net(ovs_dp_get_net(dp));
kfree(dp);
@@ -2336,32 +1837,6 @@ error:
return err;
}
-static void rehash_flow_table(struct work_struct *work)
-{
- struct datapath *dp;
- struct net *net;
-
- ovs_lock();
- rtnl_lock();
- for_each_net(net) {
- struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
-
- list_for_each_entry(dp, &ovs_net->dps, list_node) {
- struct flow_table *old_table = ovsl_dereference(dp->table);
- struct flow_table *new_table;
-
- new_table = ovs_flow_tbl_rehash(old_table);
- if (!IS_ERR(new_table)) {
- rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_destroy(old_table, true);
- }
- }
- }
- rtnl_unlock();
- ovs_unlock();
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-}
-
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -2419,8 +1894,6 @@ static int __init dp_init(void)
if (err < 0)
goto error_unreg_notifier;
- schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
-
return 0;
error_unreg_notifier:
@@ -2437,7 +1910,6 @@ error:
static void dp_cleanup(void)
{
- cancel_delayed_work_sync(&rehash_flow_wq);
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
unregister_netdevice_notifier(&ovs_dp_device_notifier);
unregister_pernet_device(&ovs_net_ops);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4d109c176ef3..d3d14a58aa91 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -27,6 +27,7 @@
#include <linux/u64_stats_sync.h>
#include "flow.h"
+#include "flow_table.h"
#include "vport.h"
#define DP_MAX_PORTS USHRT_MAX
@@ -45,11 +46,15 @@
* @n_lost: Number of received packets that had no matching flow in the flow
* table that could not be sent to userspace (normally due to an overflow in
* one of the datapath's queues).
+ * @n_mask_hit: Number of masks looked up for flow match.
+ * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked
+ * up per packet.
*/
struct dp_stats_percpu {
u64 n_hit;
u64 n_missed;
u64 n_lost;
+ u64 n_mask_hit;
struct u64_stats_sync sync;
};
@@ -57,7 +62,7 @@ struct dp_stats_percpu {
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
- * @table: Current flow table. Protected by ovs_mutex and RCU.
+ * @table: flow table.
* @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
* ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
@@ -71,7 +76,7 @@ struct datapath {
struct list_head list_node;
/* Flow table. */
- struct flow_table __rcu *table;
+ struct flow_table table;
/* Switch ports. */
struct hlist_head *ports;
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index c3235675f359..5c2dab276109 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -65,8 +65,7 @@ void ovs_dp_notify_wq(struct work_struct *work)
continue;
netdev_vport = netdev_vport_priv(vport);
- if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED ||
- netdev_vport->dev->reg_state == NETREG_UNREGISTERING)
+ if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
@@ -88,6 +87,10 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event,
return NOTIFY_DONE;
if (event == NETDEV_UNREGISTER) {
+ /* upper_dev_unlink and decrement promisc immediately */
+ ovs_netdev_detach_dev(vport);
+
+ /* schedule vport destroy, dev_put and genl notification */
ovs_net = net_generic(dev_net(dev), ovs_net_id);
queue_work(system_wq, &ovs_net->dp_notify_work);
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 410db90db73d..b409f5279601 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -45,202 +45,38 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
-static struct kmem_cache *flow_cache;
-
-static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
- struct sw_flow_key_range *range, u8 val);
-
-static void update_range__(struct sw_flow_match *match,
- size_t offset, size_t size, bool is_mask)
+u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
- struct sw_flow_key_range *range = NULL;
- size_t start = rounddown(offset, sizeof(long));
- size_t end = roundup(offset + size, sizeof(long));
-
- if (!is_mask)
- range = &match->range;
- else if (match->mask)
- range = &match->mask->range;
-
- if (!range)
- return;
-
- if (range->start == range->end) {
- range->start = start;
- range->end = end;
- return;
- }
-
- if (range->start > start)
- range->start = start;
+ struct timespec cur_ts;
+ u64 cur_ms, idle_ms;
- if (range->end < end)
- range->end = end;
-}
+ ktime_get_ts(&cur_ts);
+ idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
+ cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ts.tv_nsec / NSEC_PER_MSEC;
-#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
- do { \
- update_range__(match, offsetof(struct sw_flow_key, field), \
- sizeof((match)->key->field), is_mask); \
- if (is_mask) { \
- if ((match)->mask) \
- (match)->mask->key.field = value; \
- } else { \
- (match)->key->field = value; \
- } \
- } while (0)
-
-#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
- do { \
- update_range__(match, offsetof(struct sw_flow_key, field), \
- len, is_mask); \
- if (is_mask) { \
- if ((match)->mask) \
- memcpy(&(match)->mask->key.field, value_p, len);\
- } else { \
- memcpy(&(match)->key->field, value_p, len); \
- } \
- } while (0)
-
-static u16 range_n_bytes(const struct sw_flow_key_range *range)
-{
- return range->end - range->start;
+ return cur_ms - idle_ms;
}
-void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key,
- struct sw_flow_mask *mask)
-{
- memset(match, 0, sizeof(*match));
- match->key = key;
- match->mask = mask;
+#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
- memset(key, 0, sizeof(*key));
-
- if (mask) {
- memset(&mask->key, 0, sizeof(mask->key));
- mask->range.start = mask->range.end = 0;
- }
-}
-
-static bool ovs_match_validate(const struct sw_flow_match *match,
- u64 key_attrs, u64 mask_attrs)
+void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
{
- u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
- u64 mask_allowed = key_attrs; /* At most allow all key attributes */
-
- /* The following mask attributes allowed only if they
- * pass the validation tests. */
- mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
- | (1 << OVS_KEY_ATTR_IPV6)
- | (1 << OVS_KEY_ATTR_TCP)
- | (1 << OVS_KEY_ATTR_UDP)
- | (1 << OVS_KEY_ATTR_SCTP)
- | (1 << OVS_KEY_ATTR_ICMP)
- | (1 << OVS_KEY_ATTR_ICMPV6)
- | (1 << OVS_KEY_ATTR_ARP)
- | (1 << OVS_KEY_ATTR_ND));
-
- /* Always allowed mask fields. */
- mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
- | (1 << OVS_KEY_ATTR_IN_PORT)
- | (1 << OVS_KEY_ATTR_ETHERTYPE));
-
- /* Check key attributes. */
- if (match->key->eth.type == htons(ETH_P_ARP)
- || match->key->eth.type == htons(ETH_P_RARP)) {
- key_expected |= 1 << OVS_KEY_ATTR_ARP;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
- }
+ __be16 tcp_flags = 0;
- if (match->key->eth.type == htons(ETH_P_IP)) {
- key_expected |= 1 << OVS_KEY_ATTR_IPV4;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
-
- if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
- if (match->key->ip.proto == IPPROTO_UDP) {
- key_expected |= 1 << OVS_KEY_ATTR_UDP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
- }
-
- if (match->key->ip.proto == IPPROTO_SCTP) {
- key_expected |= 1 << OVS_KEY_ATTR_SCTP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
- }
-
- if (match->key->ip.proto == IPPROTO_TCP) {
- key_expected |= 1 << OVS_KEY_ATTR_TCP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
- }
-
- if (match->key->ip.proto == IPPROTO_ICMP) {
- key_expected |= 1 << OVS_KEY_ATTR_ICMP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
- }
- }
- }
-
- if (match->key->eth.type == htons(ETH_P_IPV6)) {
- key_expected |= 1 << OVS_KEY_ATTR_IPV6;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
-
- if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
- if (match->key->ip.proto == IPPROTO_UDP) {
- key_expected |= 1 << OVS_KEY_ATTR_UDP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
- }
-
- if (match->key->ip.proto == IPPROTO_SCTP) {
- key_expected |= 1 << OVS_KEY_ATTR_SCTP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
- }
-
- if (match->key->ip.proto == IPPROTO_TCP) {
- key_expected |= 1 << OVS_KEY_ATTR_TCP;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
- }
-
- if (match->key->ip.proto == IPPROTO_ICMPV6) {
- key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
- if (match->mask && (match->mask->key.ip.proto == 0xff))
- mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
-
- if (match->key->ipv6.tp.src ==
- htons(NDISC_NEIGHBOUR_SOLICITATION) ||
- match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
- key_expected |= 1 << OVS_KEY_ATTR_ND;
- if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
- mask_allowed |= 1 << OVS_KEY_ATTR_ND;
- }
- }
- }
- }
-
- if ((key_attrs & key_expected) != key_expected) {
- /* Key attributes check failed. */
- OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
- key_attrs, key_expected);
- return false;
- }
-
- if ((mask_attrs & mask_allowed) != mask_attrs) {
- /* Mask attributes check failed. */
- OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
- mask_attrs, mask_allowed);
- return false;
+ if ((flow->key.eth.type == htons(ETH_P_IP) ||
+ flow->key.eth.type == htons(ETH_P_IPV6)) &&
+ flow->key.ip.proto == IPPROTO_TCP &&
+ likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
+ tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
}
- return true;
+ spin_lock(&flow->lock);
+ flow->used = jiffies;
+ flow->packet_count++;
+ flow->byte_count += skb->len;
+ flow->tcp_flags |= tcp_flags;
+ spin_unlock(&flow->lock);
}
static int check_header(struct sk_buff *skb, int len)
@@ -311,19 +147,6 @@ static bool icmphdr_ok(struct sk_buff *skb)
sizeof(struct icmphdr));
}
-u64 ovs_flow_used_time(unsigned long flow_jiffies)
-{
- struct timespec cur_ts;
- u64 cur_ms, idle_ms;
-
- ktime_get_ts(&cur_ts);
- idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
- cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
- cur_ts.tv_nsec / NSEC_PER_MSEC;
-
- return cur_ms - idle_ms;
-}
-
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
unsigned int nh_ofs = skb_network_offset(skb);
@@ -372,311 +195,6 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
-void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
- const struct sw_flow_mask *mask)
-{
- const long *m = (long *)((u8 *)&mask->key + mask->range.start);
- const long *s = (long *)((u8 *)src + mask->range.start);
- long *d = (long *)((u8 *)dst + mask->range.start);
- int i;
-
- /* The memory outside of the 'mask->range' are not set since
- * further operations on 'dst' only uses contents within
- * 'mask->range'.
- */
- for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
- *d++ = *s++ & *m++;
-}
-
-#define TCP_FLAGS_OFFSET 13
-#define TCP_FLAG_MASK 0x3f
-
-void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
-{
- u8 tcp_flags = 0;
-
- if ((flow->key.eth.type == htons(ETH_P_IP) ||
- flow->key.eth.type == htons(ETH_P_IPV6)) &&
- flow->key.ip.proto == IPPROTO_TCP &&
- likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
- u8 *tcp = (u8 *)tcp_hdr(skb);
- tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
- }
-
- spin_lock(&flow->lock);
- flow->used = jiffies;
- flow->packet_count++;
- flow->byte_count += skb->len;
- flow->tcp_flags |= tcp_flags;
- spin_unlock(&flow->lock);
-}
-
-struct sw_flow_actions *ovs_flow_actions_alloc(int size)
-{
- struct sw_flow_actions *sfa;
-
- if (size > MAX_ACTIONS_BUFSIZE)
- return ERR_PTR(-EINVAL);
-
- sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
- if (!sfa)
- return ERR_PTR(-ENOMEM);
-
- sfa->actions_len = 0;
- return sfa;
-}
-
-struct sw_flow *ovs_flow_alloc(void)
-{
- struct sw_flow *flow;
-
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
- if (!flow)
- return ERR_PTR(-ENOMEM);
-
- spin_lock_init(&flow->lock);
- flow->sf_acts = NULL;
- flow->mask = NULL;
-
- return flow;
-}
-
-static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
-{
- hash = jhash_1word(hash, table->hash_seed);
- return flex_array_get(table->buckets,
- (hash & (table->n_buckets - 1)));
-}
-
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
- struct flex_array *buckets;
- int i, err;
-
- buckets = flex_array_alloc(sizeof(struct hlist_head),
- n_buckets, GFP_KERNEL);
- if (!buckets)
- return NULL;
-
- err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
- if (err) {
- flex_array_free(buckets);
- return NULL;
- }
-
- for (i = 0; i < n_buckets; i++)
- INIT_HLIST_HEAD((struct hlist_head *)
- flex_array_get(buckets, i));
-
- return buckets;
-}
-
-static void free_buckets(struct flex_array *buckets)
-{
- flex_array_free(buckets);
-}
-
-static struct flow_table *__flow_tbl_alloc(int new_size)
-{
- struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
-
- if (!table)
- return NULL;
-
- table->buckets = alloc_buckets(new_size);
-
- if (!table->buckets) {
- kfree(table);
- return NULL;
- }
- table->n_buckets = new_size;
- table->count = 0;
- table->node_ver = 0;
- table->keep_flows = false;
- get_random_bytes(&table->hash_seed, sizeof(u32));
- table->mask_list = NULL;
-
- return table;
-}
-
-static void __flow_tbl_destroy(struct flow_table *table)
-{
- int i;
-
- if (table->keep_flows)
- goto skip_flows;
-
- for (i = 0; i < table->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head = flex_array_get(table->buckets, i);
- struct hlist_node *n;
- int ver = table->node_ver;
-
- hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
- hlist_del(&flow->hash_node[ver]);
- ovs_flow_free(flow, false);
- }
- }
-
- BUG_ON(!list_empty(table->mask_list));
- kfree(table->mask_list);
-
-skip_flows:
- free_buckets(table->buckets);
- kfree(table);
-}
-
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
-{
- struct flow_table *table = __flow_tbl_alloc(new_size);
-
- if (!table)
- return NULL;
-
- table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
- if (!table->mask_list) {
- table->keep_flows = true;
- __flow_tbl_destroy(table);
- return NULL;
- }
- INIT_LIST_HEAD(table->mask_list);
-
- return table;
-}
-
-static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
-{
- struct flow_table *table = container_of(rcu, struct flow_table, rcu);
-
- __flow_tbl_destroy(table);
-}
-
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
-{
- if (!table)
- return;
-
- if (deferred)
- call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
- else
- __flow_tbl_destroy(table);
-}
-
-struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
-{
- struct sw_flow *flow;
- struct hlist_head *head;
- int ver;
- int i;
-
- ver = table->node_ver;
- while (*bucket < table->n_buckets) {
- i = 0;
- head = flex_array_get(table->buckets, *bucket);
- hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
- if (i < *last) {
- i++;
- continue;
- }
- *last = i + 1;
- return flow;
- }
- (*bucket)++;
- *last = 0;
- }
-
- return NULL;
-}
-
-static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
-{
- struct hlist_head *head;
-
- head = find_bucket(table, flow->hash);
- hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
-
- table->count++;
-}
-
-static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
-{
- int old_ver;
- int i;
-
- old_ver = old->node_ver;
- new->node_ver = !old_ver;
-
- /* Insert in new table. */
- for (i = 0; i < old->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head;
-
- head = flex_array_get(old->buckets, i);
-
- hlist_for_each_entry(flow, head, hash_node[old_ver])
- __tbl_insert(new, flow);
- }
-
- new->mask_list = old->mask_list;
- old->keep_flows = true;
-}
-
-static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
-{
- struct flow_table *new_table;
-
- new_table = __flow_tbl_alloc(n_buckets);
- if (!new_table)
- return ERR_PTR(-ENOMEM);
-
- flow_table_copy_flows(table, new_table);
-
- return new_table;
-}
-
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets);
-}
-
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
-{
- return __flow_tbl_rehash(table, table->n_buckets * 2);
-}
-
-static void __flow_free(struct sw_flow *flow)
-{
- kfree((struct sf_flow_acts __force *)flow->sf_acts);
- kmem_cache_free(flow_cache, flow);
-}
-
-static void rcu_free_flow_callback(struct rcu_head *rcu)
-{
- struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
-
- __flow_free(flow);
-}
-
-void ovs_flow_free(struct sw_flow *flow, bool deferred)
-{
- if (!flow)
- return;
-
- ovs_sw_flow_mask_del_ref(flow->mask, deferred);
-
- if (deferred)
- call_rcu(&flow->rcu, rcu_free_flow_callback);
- else
- __flow_free(flow);
-}
-
-/* Schedules 'sf_acts' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
-{
- kfree_rcu(sf_acts, rcu);
-}
-
static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
struct qtag_prefix {
@@ -910,6 +428,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
struct tcphdr *tcp = tcp_hdr(skb);
key->ipv4.tp.src = tcp->source;
key->ipv4.tp.dst = tcp->dest;
+ key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == IPPROTO_UDP) {
if (udphdr_ok(skb)) {
@@ -978,6 +497,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
struct tcphdr *tcp = tcp_hdr(skb);
key->ipv6.tp.src = tcp->source;
key->ipv6.tp.dst = tcp->dest;
+ key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
}
} else if (key->ip.proto == NEXTHDR_UDP) {
if (udphdr_ok(skb)) {
@@ -1002,1080 +522,3 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
return 0;
}
-
-static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start,
- int key_end)
-{
- u32 *hash_key = (u32 *)((u8 *)key + key_start);
- int hash_u32s = (key_end - key_start) >> 2;
-
- /* Make sure number of hash bytes are multiple of u32. */
- BUILD_BUG_ON(sizeof(long) % sizeof(u32));
-
- return jhash2(hash_key, hash_u32s, 0);
-}
-
-static int flow_key_start(const struct sw_flow_key *key)
-{
- if (key->tun_key.ipv4_dst)
- return 0;
- else
- return rounddown(offsetof(struct sw_flow_key, phy),
- sizeof(long));
-}
-
-static bool __cmp_key(const struct sw_flow_key *key1,
- const struct sw_flow_key *key2, int key_start, int key_end)
-{
- const long *cp1 = (long *)((u8 *)key1 + key_start);
- const long *cp2 = (long *)((u8 *)key2 + key_start);
- long diffs = 0;
- int i;
-
- for (i = key_start; i < key_end; i += sizeof(long))
- diffs |= *cp1++ ^ *cp2++;
-
- return diffs == 0;
-}
-
-static bool __flow_cmp_masked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_start, int key_end)
-{
- return __cmp_key(&flow->key, key, key_start, key_end);
-}
-
-static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_start, int key_end)
-{
- return __cmp_key(&flow->unmasked_key, key, key_start, key_end);
-}
-
-bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_end)
-{
- int key_start;
- key_start = flow_key_start(key);
-
- return __flow_cmp_unmasked_key(flow, key, key_start, key_end);
-
-}
-
-struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
- struct sw_flow_match *match)
-{
- struct sw_flow_key *unmasked = match->key;
- int key_end = match->range.end;
- struct sw_flow *flow;
-
- flow = ovs_flow_lookup(table, unmasked);
- if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end)))
- flow = NULL;
-
- return flow;
-}
-
-static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
- const struct sw_flow_key *unmasked,
- struct sw_flow_mask *mask)
-{
- struct sw_flow *flow;
- struct hlist_head *head;
- int key_start = mask->range.start;
- int key_end = mask->range.end;
- u32 hash;
- struct sw_flow_key masked_key;
-
- ovs_flow_key_mask(&masked_key, unmasked, mask);
- hash = ovs_flow_hash(&masked_key, key_start, key_end);
- head = find_bucket(table, hash);
- hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
- if (flow->mask == mask &&
- __flow_cmp_masked_key(flow, &masked_key,
- key_start, key_end))
- return flow;
- }
- return NULL;
-}
-
-struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
- const struct sw_flow_key *key)
-{
- struct sw_flow *flow = NULL;
- struct sw_flow_mask *mask;
-
- list_for_each_entry_rcu(mask, tbl->mask_list, list) {
- flow = ovs_masked_flow_lookup(tbl, key, mask);
- if (flow) /* Found */
- break;
- }
-
- return flow;
-}
-
-
-void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow)
-{
- flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start,
- flow->mask->range.end);
- __tbl_insert(table, flow);
-}
-
-void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow)
-{
- BUG_ON(table->count == 0);
- hlist_del_rcu(&flow->hash_node[table->node_ver]);
- table->count--;
-}
-
-/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
-const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
- [OVS_KEY_ATTR_ENCAP] = -1,
- [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
- [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
- [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
- [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
- [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
- [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
- [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
- [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
- [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
- [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
- [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
- [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
- [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
- [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
- [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
- [OVS_KEY_ATTR_TUNNEL] = -1,
-};
-
-static bool is_all_zero(const u8 *fp, size_t size)
-{
- int i;
-
- if (!fp)
- return false;
-
- for (i = 0; i < size; i++)
- if (fp[i])
- return false;
-
- return true;
-}
-
-static int __parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[],
- u64 *attrsp, bool nz)
-{
- const struct nlattr *nla;
- u32 attrs;
- int rem;
-
- attrs = *attrsp;
- nla_for_each_nested(nla, attr, rem) {
- u16 type = nla_type(nla);
- int expected_len;
-
- if (type > OVS_KEY_ATTR_MAX) {
- OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
- type, OVS_KEY_ATTR_MAX);
- return -EINVAL;
- }
-
- if (attrs & (1 << type)) {
- OVS_NLERR("Duplicate key attribute (type %d).\n", type);
- return -EINVAL;
- }
-
- expected_len = ovs_key_lens[type];
- if (nla_len(nla) != expected_len && expected_len != -1) {
- OVS_NLERR("Key attribute has unexpected length (type=%d"
- ", length=%d, expected=%d).\n", type,
- nla_len(nla), expected_len);
- return -EINVAL;
- }
-
- if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
- attrs |= 1 << type;
- a[type] = nla;
- }
- }
- if (rem) {
- OVS_NLERR("Message has %d unknown bytes.\n", rem);
- return -EINVAL;
- }
-
- *attrsp = attrs;
- return 0;
-}
-
-static int parse_flow_mask_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp)
-{
- return __parse_flow_nlattrs(attr, a, attrsp, true);
-}
-
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp)
-{
- return __parse_flow_nlattrs(attr, a, attrsp, false);
-}
-
-int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
- struct sw_flow_match *match, bool is_mask)
-{
- struct nlattr *a;
- int rem;
- bool ttl = false;
- __be16 tun_flags = 0;
-
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
- [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
- [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
- [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
- [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
- [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
- };
-
- if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
- OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
- type, OVS_TUNNEL_KEY_ATTR_MAX);
- return -EINVAL;
- }
-
- if (ovs_tunnel_key_lens[type] != nla_len(a)) {
- OVS_NLERR("IPv4 tunnel attribute type has unexpected "
- " length (type=%d, length=%d, expected=%d).\n",
- type, nla_len(a), ovs_tunnel_key_lens[type]);
- return -EINVAL;
- }
-
- switch (type) {
- case OVS_TUNNEL_KEY_ATTR_ID:
- SW_FLOW_KEY_PUT(match, tun_key.tun_id,
- nla_get_be64(a), is_mask);
- tun_flags |= TUNNEL_KEY;
- break;
- case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
- nla_get_be32(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
- nla_get_be32(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_TOS:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
- nla_get_u8(a), is_mask);
- break;
- case OVS_TUNNEL_KEY_ATTR_TTL:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
- nla_get_u8(a), is_mask);
- ttl = true;
- break;
- case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
- tun_flags |= TUNNEL_DONT_FRAGMENT;
- break;
- case OVS_TUNNEL_KEY_ATTR_CSUM:
- tun_flags |= TUNNEL_CSUM;
- break;
- default:
- return -EINVAL;
- }
- }
-
- SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
-
- if (rem > 0) {
- OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
- return -EINVAL;
- }
-
- if (!is_mask) {
- if (!match->key->tun_key.ipv4_dst) {
- OVS_NLERR("IPv4 tunnel destination address is zero.\n");
- return -EINVAL;
- }
-
- if (!ttl) {
- OVS_NLERR("IPv4 tunnel TTL not specified.\n");
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *tun_key,
- const struct ovs_key_ipv4_tunnel *output)
-{
- struct nlattr *nla;
-
- nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
- if (!nla)
- return -EMSGSIZE;
-
- if (output->tun_flags & TUNNEL_KEY &&
- nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
- return -EMSGSIZE;
- if (output->ipv4_src &&
- nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
- return -EMSGSIZE;
- if (output->ipv4_dst &&
- nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
- return -EMSGSIZE;
- if (output->ipv4_tos &&
- nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
- return -EMSGSIZE;
- if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
- return -EMSGSIZE;
- if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
- nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
- return -EMSGSIZE;
- if ((output->tun_flags & TUNNEL_CSUM) &&
- nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
- return -EMSGSIZE;
-
- nla_nest_end(skb, nla);
- return 0;
-}
-
-static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
- const struct nlattr **a, bool is_mask)
-{
- if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
- SW_FLOW_KEY_PUT(match, phy.priority,
- nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
- }
-
- if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
- u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
-
- if (is_mask)
- in_port = 0xffffffff; /* Always exact match in_port. */
- else if (in_port >= DP_MAX_PORTS)
- return -EINVAL;
-
- SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
- }
-
- if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
- uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
-
- SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
- *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
- }
- if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
- if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
- is_mask))
- return -EINVAL;
- *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
- }
- return 0;
-}
-
-static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
- const struct nlattr **a, bool is_mask)
-{
- int err;
- u64 orig_attrs = attrs;
-
- err = metadata_from_nlattrs(match, &attrs, a, is_mask);
- if (err)
- return err;
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
- const struct ovs_key_ethernet *eth_key;
-
- eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
- SW_FLOW_KEY_MEMCPY(match, eth.src,
- eth_key->eth_src, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, eth.dst,
- eth_key->eth_dst, ETH_ALEN, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- __be16 tci;
-
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- if (is_mask)
- OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
- else
- OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
-
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
- } else if (!is_mask)
- SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- __be16 eth_type;
-
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (is_mask) {
- /* Always exact match EtherType. */
- eth_type = htons(0xffff);
- } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
- OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
- ntohs(eth_type), ETH_P_802_3_MIN);
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- const struct ovs_key_ipv4 *ipv4_key;
-
- ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
- if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
- OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
- ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
- return -EINVAL;
- }
- SW_FLOW_KEY_PUT(match, ip.proto,
- ipv4_key->ipv4_proto, is_mask);
- SW_FLOW_KEY_PUT(match, ip.tos,
- ipv4_key->ipv4_tos, is_mask);
- SW_FLOW_KEY_PUT(match, ip.ttl,
- ipv4_key->ipv4_ttl, is_mask);
- SW_FLOW_KEY_PUT(match, ip.frag,
- ipv4_key->ipv4_frag, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.src,
- ipv4_key->ipv4_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
- ipv4_key->ipv4_dst, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
- const struct ovs_key_ipv6 *ipv6_key;
-
- ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
- if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
- OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
- ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
- return -EINVAL;
- }
- SW_FLOW_KEY_PUT(match, ipv6.label,
- ipv6_key->ipv6_label, is_mask);
- SW_FLOW_KEY_PUT(match, ip.proto,
- ipv6_key->ipv6_proto, is_mask);
- SW_FLOW_KEY_PUT(match, ip.tos,
- ipv6_key->ipv6_tclass, is_mask);
- SW_FLOW_KEY_PUT(match, ip.ttl,
- ipv6_key->ipv6_hlimit, is_mask);
- SW_FLOW_KEY_PUT(match, ip.frag,
- ipv6_key->ipv6_frag, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
- ipv6_key->ipv6_src,
- sizeof(match->key->ipv6.addr.src),
- is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
- ipv6_key->ipv6_dst,
- sizeof(match->key->ipv6.addr.dst),
- is_mask);
-
- attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
- const struct ovs_key_arp *arp_key;
-
- arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
- if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
- OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
- arp_key->arp_op);
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, ipv4.addr.src,
- arp_key->arp_sip, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
- arp_key->arp_tip, is_mask);
- SW_FLOW_KEY_PUT(match, ip.proto,
- ntohs(arp_key->arp_op), is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
- arp_key->arp_sha, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
- arp_key->arp_tha, ETH_ALEN, is_mask);
-
- attrs &= ~(1 << OVS_KEY_ATTR_ARP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
- const struct ovs_key_tcp *tcp_key;
-
- tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- tcp_key->tcp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- tcp_key->tcp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- tcp_key->tcp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- tcp_key->tcp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_TCP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
- const struct ovs_key_udp *udp_key;
-
- udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- udp_key->udp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- udp_key->udp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- udp_key->udp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- udp_key->udp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_UDP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
- const struct ovs_key_sctp *sctp_key;
-
- sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
- if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- sctp_key->sctp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- sctp_key->sctp_dst, is_mask);
- } else {
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- sctp_key->sctp_src, is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- sctp_key->sctp_dst, is_mask);
- }
- attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
- const struct ovs_key_icmp *icmp_key;
-
- icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
- SW_FLOW_KEY_PUT(match, ipv4.tp.src,
- htons(icmp_key->icmp_type), is_mask);
- SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
- htons(icmp_key->icmp_code), is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
- const struct ovs_key_icmpv6 *icmpv6_key;
-
- icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
- SW_FLOW_KEY_PUT(match, ipv6.tp.src,
- htons(icmpv6_key->icmpv6_type), is_mask);
- SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
- htons(icmpv6_key->icmpv6_code), is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ND)) {
- const struct ovs_key_nd *nd_key;
-
- nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
- nd_key->nd_target,
- sizeof(match->key->ipv6.nd.target),
- is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
- nd_key->nd_sll, ETH_ALEN, is_mask);
- SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
- nd_key->nd_tll, ETH_ALEN, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ND);
- }
-
- if (attrs != 0)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
- * mask. In case the 'mask' is NULL, the flow is treated as exact match
- * flow. Otherwise, it is treated as a wildcarded flow, except the mask
- * does not include any don't care bit.
- * @match: receives the extracted flow match information.
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence. The fields should of the packet that triggered the creation
- * of this flow.
- * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
- * attribute specifies the mask field of the wildcarded flow.
- */
-int ovs_match_from_nlattrs(struct sw_flow_match *match,
- const struct nlattr *key,
- const struct nlattr *mask)
-{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- const struct nlattr *encap;
- u64 key_attrs = 0;
- u64 mask_attrs = 0;
- bool encap_valid = false;
- int err;
-
- err = parse_flow_nlattrs(key, a, &key_attrs);
- if (err)
- return err;
-
- if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
- (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
- __be16 tci;
-
- if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
- OVS_NLERR("Invalid Vlan frame.\n");
- return -EINVAL;
- }
-
- key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- encap = a[OVS_KEY_ATTR_ENCAP];
- key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- encap_valid = true;
-
- if (tci & htons(VLAN_TAG_PRESENT)) {
- err = parse_flow_nlattrs(encap, a, &key_attrs);
- if (err)
- return err;
- } else if (!tci) {
- /* Corner case for truncated 802.1Q header. */
- if (nla_len(encap)) {
- OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
- return -EINVAL;
- }
- } else {
- OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
- return -EINVAL;
- }
- }
-
- err = ovs_key_from_nlattrs(match, key_attrs, a, false);
- if (err)
- return err;
-
- if (mask) {
- err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
- if (err)
- return err;
-
- if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) {
- __be16 eth_type = 0;
- __be16 tci = 0;
-
- if (!encap_valid) {
- OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
- return -EINVAL;
- }
-
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- if (a[OVS_KEY_ATTR_ETHERTYPE])
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-
- if (eth_type == htons(0xffff)) {
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- encap = a[OVS_KEY_ATTR_ENCAP];
- err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
- } else {
- OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
- ntohs(eth_type));
- return -EINVAL;
- }
-
- if (a[OVS_KEY_ATTR_VLAN])
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
-
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
- return -EINVAL;
- }
- }
-
- err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
- if (err)
- return err;
- } else {
- /* Populate exact match flow's key mask. */
- if (match->mask)
- ovs_sw_flow_mask_set(match->mask, &match->range, 0xff);
- }
-
- if (!ovs_match_validate(match, key_attrs, mask_attrs))
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
- * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- *
- * This parses a series of Netlink attributes that form a flow key, which must
- * take the same form accepted by flow_from_nlattrs(), but only enough of it to
- * get the metadata, that is, the parts of the flow key that cannot be
- * extracted from the packet itself.
- */
-
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
- const struct nlattr *attr)
-{
- struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- u64 attrs = 0;
- int err;
- struct sw_flow_match match;
-
- flow->key.phy.in_port = DP_MAX_PORTS;
- flow->key.phy.priority = 0;
- flow->key.phy.skb_mark = 0;
- memset(tun_key, 0, sizeof(flow->key.tun_key));
-
- err = parse_flow_nlattrs(attr, a, &attrs);
- if (err)
- return -EINVAL;
-
- memset(&match, 0, sizeof(match));
- match.key = &flow->key;
-
- err = metadata_from_nlattrs(&match, &attrs, a, false);
- if (err)
- return err;
-
- return 0;
-}
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
- const struct sw_flow_key *output, struct sk_buff *skb)
-{
- struct ovs_key_ethernet *eth_key;
- struct nlattr *nla, *encap;
- bool is_mask = (swkey != output);
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
- goto nla_put_failure;
-
- if ((swkey->tun_key.ipv4_dst || is_mask) &&
- ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
- goto nla_put_failure;
-
- if (swkey->phy.in_port == DP_MAX_PORTS) {
- if (is_mask && (output->phy.in_port == 0xffff))
- if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
- goto nla_put_failure;
- } else {
- u16 upper_u16;
- upper_u16 = !is_mask ? 0 : 0xffff;
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
- (upper_u16 << 16) | output->phy.in_port))
- goto nla_put_failure;
- }
-
- if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
- goto nla_put_failure;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
-
- eth_key = nla_data(nla);
- memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
- memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
-
- if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
- __be16 eth_type;
- eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
- nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
- goto nla_put_failure;
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.tci)
- goto unencap;
- } else
- encap = NULL;
-
- if (swkey->eth.type == htons(ETH_P_802_2)) {
- /*
- * Ethertype 802.2 is represented in the netlink with omitted
- * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
- * 0xffff in the mask attribute. Ethertype can also
- * be wildcarded.
- */
- if (is_mask && output->eth.type)
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
- output->eth.type))
- goto nla_put_failure;
- goto unencap;
- }
-
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
- goto nla_put_failure;
-
- if (swkey->eth.type == htons(ETH_P_IP)) {
- struct ovs_key_ipv4 *ipv4_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
- if (!nla)
- goto nla_put_failure;
- ipv4_key = nla_data(nla);
- ipv4_key->ipv4_src = output->ipv4.addr.src;
- ipv4_key->ipv4_dst = output->ipv4.addr.dst;
- ipv4_key->ipv4_proto = output->ip.proto;
- ipv4_key->ipv4_tos = output->ip.tos;
- ipv4_key->ipv4_ttl = output->ip.ttl;
- ipv4_key->ipv4_frag = output->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- struct ovs_key_ipv6 *ipv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
- if (!nla)
- goto nla_put_failure;
- ipv6_key = nla_data(nla);
- memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
- sizeof(ipv6_key->ipv6_src));
- memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
- sizeof(ipv6_key->ipv6_dst));
- ipv6_key->ipv6_label = output->ipv6.label;
- ipv6_key->ipv6_proto = output->ip.proto;
- ipv6_key->ipv6_tclass = output->ip.tos;
- ipv6_key->ipv6_hlimit = output->ip.ttl;
- ipv6_key->ipv6_frag = output->ip.frag;
- } else if (swkey->eth.type == htons(ETH_P_ARP) ||
- swkey->eth.type == htons(ETH_P_RARP)) {
- struct ovs_key_arp *arp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
- if (!nla)
- goto nla_put_failure;
- arp_key = nla_data(nla);
- memset(arp_key, 0, sizeof(struct ovs_key_arp));
- arp_key->arp_sip = output->ipv4.addr.src;
- arp_key->arp_tip = output->ipv4.addr.dst;
- arp_key->arp_op = htons(output->ip.proto);
- memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
- memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
- }
-
- if ((swkey->eth.type == htons(ETH_P_IP) ||
- swkey->eth.type == htons(ETH_P_IPV6)) &&
- swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-
- if (swkey->ip.proto == IPPROTO_TCP) {
- struct ovs_key_tcp *tcp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
- if (!nla)
- goto nla_put_failure;
- tcp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- tcp_key->tcp_src = output->ipv4.tp.src;
- tcp_key->tcp_dst = output->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- tcp_key->tcp_src = output->ipv6.tp.src;
- tcp_key->tcp_dst = output->ipv6.tp.dst;
- }
- } else if (swkey->ip.proto == IPPROTO_UDP) {
- struct ovs_key_udp *udp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
- if (!nla)
- goto nla_put_failure;
- udp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- udp_key->udp_src = output->ipv4.tp.src;
- udp_key->udp_dst = output->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- udp_key->udp_src = output->ipv6.tp.src;
- udp_key->udp_dst = output->ipv6.tp.dst;
- }
- } else if (swkey->ip.proto == IPPROTO_SCTP) {
- struct ovs_key_sctp *sctp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
- if (!nla)
- goto nla_put_failure;
- sctp_key = nla_data(nla);
- if (swkey->eth.type == htons(ETH_P_IP)) {
- sctp_key->sctp_src = swkey->ipv4.tp.src;
- sctp_key->sctp_dst = swkey->ipv4.tp.dst;
- } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- sctp_key->sctp_src = swkey->ipv6.tp.src;
- sctp_key->sctp_dst = swkey->ipv6.tp.dst;
- }
- } else if (swkey->eth.type == htons(ETH_P_IP) &&
- swkey->ip.proto == IPPROTO_ICMP) {
- struct ovs_key_icmp *icmp_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
- if (!nla)
- goto nla_put_failure;
- icmp_key = nla_data(nla);
- icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
- icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
- } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
- swkey->ip.proto == IPPROTO_ICMPV6) {
- struct ovs_key_icmpv6 *icmpv6_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
- sizeof(*icmpv6_key));
- if (!nla)
- goto nla_put_failure;
- icmpv6_key = nla_data(nla);
- icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
- icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
-
- if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
- icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
- struct ovs_key_nd *nd_key;
-
- nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
- if (!nla)
- goto nla_put_failure;
- nd_key = nla_data(nla);
- memcpy(nd_key->nd_target, &output->ipv6.nd.target,
- sizeof(nd_key->nd_target));
- memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
- memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
- }
- }
- }
-
-unencap:
- if (encap)
- nla_nest_end(skb, encap);
-
- return 0;
-
-nla_put_failure:
- return -EMSGSIZE;
-}
-
-/* Initializes the flow module.
- * Returns zero if successful or a negative error code. */
-int ovs_flow_init(void)
-{
- BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
- BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
-
- flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
- 0, NULL);
- if (flow_cache == NULL)
- return -ENOMEM;
-
- return 0;
-}
-
-/* Uninitializes the flow module. */
-void ovs_flow_exit(void)
-{
- kmem_cache_destroy(flow_cache);
-}
-
-struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
-{
- struct sw_flow_mask *mask;
-
- mask = kmalloc(sizeof(*mask), GFP_KERNEL);
- if (mask)
- mask->ref_count = 0;
-
- return mask;
-}
-
-void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask)
-{
- mask->ref_count++;
-}
-
-void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
-{
- if (!mask)
- return;
-
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- if (deferred)
- kfree_rcu(mask, rcu);
- else
- kfree(mask);
- }
-}
-
-static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
- const struct sw_flow_mask *b)
-{
- u8 *a_ = (u8 *)&a->key + a->range.start;
- u8 *b_ = (u8 *)&b->key + b->range.start;
-
- return (a->range.end == b->range.end)
- && (a->range.start == b->range.start)
- && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
-}
-
-struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
- const struct sw_flow_mask *mask)
-{
- struct list_head *ml;
-
- list_for_each(ml, tbl->mask_list) {
- struct sw_flow_mask *m;
- m = container_of(ml, struct sw_flow_mask, list);
- if (ovs_sw_flow_mask_equal(mask, m))
- return m;
- }
-
- return NULL;
-}
-
-/**
- * add a new mask into the mask list.
- * The caller needs to make sure that 'mask' is not the same
- * as any masks that are already on the list.
- */
-void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask)
-{
- list_add_rcu(&mask->list, tbl->mask_list);
-}
-
-/**
- * Set 'range' fields in the mask to the value of 'val'.
- */
-static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
- struct sw_flow_key_range *range, u8 val)
-{
- u8 *m = (u8 *)&mask->key + range->start;
-
- mask->range = *range;
- memset(m, val, range_n_bytes(range));
-}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 212fbf7510c4..1510f51dbf74 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -33,14 +33,6 @@
#include <net/inet_ecn.h>
struct sk_buff;
-struct sw_flow_mask;
-struct flow_table;
-
-struct sw_flow_actions {
- struct rcu_head rcu;
- u32 actions_len;
- struct nlattr actions[];
-};
/* Used to memset ovs_key_ipv4_tunnel padding. */
#define OVS_TUNNEL_KEY_SIZE \
@@ -101,6 +93,7 @@ struct sw_flow_key {
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
} tp;
struct {
u8 sha[ETH_ALEN]; /* ARP source hardware address. */
@@ -117,6 +110,7 @@ struct sw_flow_key {
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
} tp;
struct {
struct in6_addr target; /* ND target address. */
@@ -127,6 +121,31 @@ struct sw_flow_key {
};
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+struct sw_flow_key_range {
+ size_t start;
+ size_t end;
+};
+
+struct sw_flow_mask {
+ int ref_count;
+ struct rcu_head rcu;
+ struct list_head list;
+ struct sw_flow_key_range range;
+ struct sw_flow_key key;
+};
+
+struct sw_flow_match {
+ struct sw_flow_key *key;
+ struct sw_flow_key_range range;
+ struct sw_flow_mask *mask;
+};
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ u32 actions_len;
+ struct nlattr actions[];
+};
+
struct sw_flow {
struct rcu_head rcu;
struct hlist_node hash_node[2];
@@ -141,23 +160,9 @@ struct sw_flow {
unsigned long used; /* Last used time (in jiffies). */
u64 packet_count; /* Number of packets matched. */
u64 byte_count; /* Number of bytes matched. */
- u8 tcp_flags; /* Union of seen TCP flags. */
-};
-
-struct sw_flow_key_range {
- size_t start;
- size_t end;
+ __be16 tcp_flags; /* Union of seen TCP flags. */
};
-struct sw_flow_match {
- struct sw_flow_key *key;
- struct sw_flow_key_range range;
- struct sw_flow_mask *mask;
-};
-
-void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key, struct sw_flow_mask *mask);
-
struct arp_eth_header {
__be16 ar_hrd; /* format of hardware address */
__be16 ar_pro; /* format of protocol address */
@@ -172,88 +177,9 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
-int ovs_flow_init(void);
-void ovs_flow_exit(void);
-
-struct sw_flow *ovs_flow_alloc(void);
-void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *, bool deferred);
-
-struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
-void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
-
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
void ovs_flow_used(struct sw_flow *, struct sk_buff *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
-int ovs_flow_to_nlattrs(const struct sw_flow_key *,
- const struct sw_flow_key *, struct sk_buff *);
-int ovs_match_from_nlattrs(struct sw_flow_match *match,
- const struct nlattr *,
- const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
- const struct nlattr *attr);
-#define MAX_ACTIONS_BUFSIZE (32 * 1024)
-#define TBL_MIN_BUCKETS 1024
-
-struct flow_table {
- struct flex_array *buckets;
- unsigned int count, n_buckets;
- struct rcu_head rcu;
- struct list_head *mask_list;
- int node_ver;
- u32 hash_seed;
- bool keep_flows;
-};
-
-static inline int ovs_flow_tbl_count(struct flow_table *table)
-{
- return table->count;
-}
-
-static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
-{
- return (table->count > table->n_buckets);
-}
-
-struct sw_flow *ovs_flow_lookup(struct flow_table *,
- const struct sw_flow_key *);
-struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
- struct sw_flow_match *match);
-
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
-struct flow_table *ovs_flow_tbl_alloc(int new_size);
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
-struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-
-void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow);
-void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow);
-
-struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx);
-extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
-int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
- struct sw_flow_match *match, bool is_mask);
-int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *tun_key,
- const struct ovs_key_ipv4_tunnel *output);
-
-bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
- const struct sw_flow_key *key, int key_end);
-
-struct sw_flow_mask {
- int ref_count;
- struct rcu_head rcu;
- struct list_head list;
- struct sw_flow_key_range range;
- struct sw_flow_key key;
-};
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
-struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
-void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
-void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
-void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *);
-struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *,
- const struct sw_flow_mask *);
-void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
- const struct sw_flow_mask *mask);
#endif /* flow.h */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
new file mode 100644
index 000000000000..2bc1bc1aca3b
--- /dev/null
+++ b/net/openvswitch/flow_netlink.c
@@ -0,0 +1,1630 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#include "flow_netlink.h"
+
+static void update_range__(struct sw_flow_match *match,
+ size_t offset, size_t size, bool is_mask)
+{
+ struct sw_flow_key_range *range = NULL;
+ size_t start = rounddown(offset, sizeof(long));
+ size_t end = roundup(offset + size, sizeof(long));
+
+ if (!is_mask)
+ range = &match->range;
+ else if (match->mask)
+ range = &match->mask->range;
+
+ if (!range)
+ return;
+
+ if (range->start == range->end) {
+ range->start = start;
+ range->end = end;
+ return;
+ }
+
+ if (range->start > start)
+ range->start = start;
+
+ if (range->end < end)
+ range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ (match)->mask->key.field = value; \
+ } else { \
+ (match)->key->field = value; \
+ } \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ len, is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ memcpy(&(match)->mask->key.field, value_p, len);\
+ } else { \
+ memcpy(&(match)->key->field, value_p, len); \
+ } \
+ } while (0)
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+static bool match_validate(const struct sw_flow_match *match,
+ u64 key_attrs, u64 mask_attrs)
+{
+ u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 mask_allowed = key_attrs; /* At most allow all key attributes */
+
+ /* The following mask attributes allowed only if they
+ * pass the validation tests. */
+ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_TCP)
+ | (1 << OVS_KEY_ATTR_TCP_FLAGS)
+ | (1 << OVS_KEY_ATTR_UDP)
+ | (1 << OVS_KEY_ATTR_SCTP)
+ | (1 << OVS_KEY_ATTR_ICMP)
+ | (1 << OVS_KEY_ATTR_ICMPV6)
+ | (1 << OVS_KEY_ATTR_ARP)
+ | (1 << OVS_KEY_ATTR_ND));
+
+ /* Always allowed mask fields. */
+ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+ | (1 << OVS_KEY_ATTR_IN_PORT)
+ | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+ /* Check key attributes. */
+ if (match->key->eth.type == htons(ETH_P_ARP)
+ || match->key->eth.type == htons(ETH_P_RARP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ARP;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMP) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+ }
+ }
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IPV6)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMPV6) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+ if (match->key->ipv6.tp.src ==
+ htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ND;
+ if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+ }
+ }
+ }
+ }
+
+ if ((key_attrs & key_expected) != key_expected) {
+ /* Key attributes check failed. */
+ OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+ key_attrs, key_expected);
+ return false;
+ }
+
+ if ((mask_attrs & mask_allowed) != mask_attrs) {
+ /* Mask attributes check failed. */
+ OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+ mask_attrs, mask_allowed);
+ return false;
+ }
+
+ return true;
+}
+
+/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
+static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = -1,
+ [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
+ [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+ [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
+ [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
+ [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
+ [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
+ [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
+ [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
+ [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
+ [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
+ [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+ [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
+ [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
+ [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
+ [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
+ [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+ [OVS_KEY_ATTR_TUNNEL] = -1,
+};
+
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+ int i;
+
+ if (!fp)
+ return false;
+
+ for (i = 0; i < size; i++)
+ if (fp[i])
+ return false;
+
+ return true;
+}
+
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[],
+ u64 *attrsp, bool nz)
+{
+ const struct nlattr *nla;
+ u64 attrs;
+ int rem;
+
+ attrs = *attrsp;
+ nla_for_each_nested(nla, attr, rem) {
+ u16 type = nla_type(nla);
+ int expected_len;
+
+ if (type > OVS_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+ type, OVS_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (attrs & (1 << type)) {
+ OVS_NLERR("Duplicate key attribute (type %d).\n", type);
+ return -EINVAL;
+ }
+
+ expected_len = ovs_key_lens[type];
+ if (nla_len(nla) != expected_len && expected_len != -1) {
+ OVS_NLERR("Key attribute has unexpected length (type=%d"
+ ", length=%d, expected=%d).\n", type,
+ nla_len(nla), expected_len);
+ return -EINVAL;
+ }
+
+ if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+ attrs |= 1 << type;
+ a[type] = nla;
+ }
+ }
+ if (rem) {
+ OVS_NLERR("Message has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ *attrsp = attrs;
+ return 0;
+}
+
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
+
+static int ipv4_tun_from_nlattr(const struct nlattr *attr,
+ struct sw_flow_match *match, bool is_mask)
+{
+ struct nlattr *a;
+ int rem;
+ bool ttl = false;
+ __be16 tun_flags = 0;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+ [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
+ [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
+ [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
+ [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
+ [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+ };
+
+ if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+ OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+ type, OVS_TUNNEL_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+ OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+ " length (type=%d, length=%d, expected=%d).\n",
+ type, nla_len(a), ovs_tunnel_key_lens[type]);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_TUNNEL_KEY_ATTR_ID:
+ SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+ nla_get_be64(a), is_mask);
+ tun_flags |= TUNNEL_KEY;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ nla_get_be32(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TOS:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ nla_get_u8(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TTL:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ nla_get_u8(a), is_mask);
+ ttl = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+ tun_flags |= TUNNEL_DONT_FRAGMENT;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_CSUM:
+ tun_flags |= TUNNEL_CSUM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+
+ if (rem > 0) {
+ OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+
+ if (!is_mask) {
+ if (!match->key->tun_key.ipv4_dst) {
+ OVS_NLERR("IPv4 tunnel destination address is zero.\n");
+ return -EINVAL;
+ }
+
+ if (!ttl) {
+ OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ipv4_tun_to_nlattr(struct sk_buff *skb,
+ const struct ovs_key_ipv4_tunnel *tun_key,
+ const struct ovs_key_ipv4_tunnel *output)
+{
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (output->tun_flags & TUNNEL_KEY &&
+ nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+ return -EMSGSIZE;
+ if (output->ipv4_src &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+ return -EMSGSIZE;
+ if (output->ipv4_dst &&
+ nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+ return -EMSGSIZE;
+ if (output->ipv4_tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_CSUM) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+
+static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+ SW_FLOW_KEY_PUT(match, phy.priority,
+ nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+ u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
+
+ if (is_mask)
+ in_port = 0xffffffff; /* Always exact match in_port. */
+ else if (in_port >= DP_MAX_PORTS)
+ return -EINVAL;
+
+ SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+ uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+ SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+ if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+ is_mask))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
+ }
+ return 0;
+}
+
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+ const struct nlattr **a, bool is_mask)
+{
+ int err;
+ u64 orig_attrs = attrs;
+
+ err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+ if (err)
+ return err;
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+ const struct ovs_key_ethernet *eth_key;
+
+ eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+ SW_FLOW_KEY_MEMCPY(match, eth.src,
+ eth_key->eth_src, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, eth.dst,
+ eth_key->eth_dst, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ __be16 tci;
+
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (is_mask)
+ OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+ else
+ OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
+
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ } else if (!is_mask)
+ SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
+ if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+ ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv4_key->ipv4_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv4_key->ipv4_tos, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv4_key->ipv4_ttl, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv4_key->ipv4_frag, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ ipv4_key->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ ipv4_key->ipv4_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+ const struct ovs_key_ipv6 *ipv6_key;
+
+ ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+ if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+ ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ipv6.label,
+ ipv6_key->ipv6_label, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv6_key->ipv6_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv6_key->ipv6_tclass, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv6_key->ipv6_hlimit, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv6_key->ipv6_frag, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+ ipv6_key->ipv6_src,
+ sizeof(match->key->ipv6.addr.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+ ipv6_key->ipv6_dst,
+ sizeof(match->key->ipv6.addr.dst),
+ is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
+ const struct ovs_key_arp *arp_key;
+
+ arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+ if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+ OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+ arp_key->arp_op);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ arp_key->arp_sip, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ arp_key->arp_tip, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ntohs(arp_key->arp_op), is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+ arp_key->arp_sha, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+ arp_key->arp_tha, ETH_ALEN, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+ const struct ovs_key_tcp *tcp_key;
+
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ tcp_key->tcp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ tcp_key->tcp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+ const struct ovs_key_udp *udp_key;
+
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ udp_key->udp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ udp_key->udp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+ const struct ovs_key_sctp *sctp_key;
+
+ sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+ if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ sctp_key->sctp_dst, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ sctp_key->sctp_dst, is_mask);
+ }
+ attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+ const struct ovs_key_icmp *icmp_key;
+
+ icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+ htons(icmp_key->icmp_type), is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+ htons(icmp_key->icmp_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+ const struct ovs_key_icmpv6 *icmpv6_key;
+
+ icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+ htons(icmpv6_key->icmpv6_type), is_mask);
+ SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+ htons(icmpv6_key->icmpv6_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+ const struct ovs_key_nd *nd_key;
+
+ nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+ nd_key->nd_target,
+ sizeof(match->key->ipv6.nd.target),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+ nd_key->nd_sll, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+ nd_key->nd_tll, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ND);
+ }
+
+ if (attrs != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sw_flow_mask_set(struct sw_flow_mask *mask,
+ struct sw_flow_key_range *range, u8 val)
+{
+ u8 *m = (u8 *)&mask->key + range->start;
+
+ mask->range = *range;
+ memset(m, val, range_n_bytes(range));
+}
+
+/**
+ * ovs_nla_get_match - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ */
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *key,
+ const struct nlattr *mask)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ const struct nlattr *encap;
+ u64 key_attrs = 0;
+ u64 mask_attrs = 0;
+ bool encap_valid = false;
+ int err;
+
+ err = parse_flow_nlattrs(key, a, &key_attrs);
+ if (err)
+ return err;
+
+ if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+ __be16 tci;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR("Invalid Vlan frame.\n");
+ return -EINVAL;
+ }
+
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ encap_valid = true;
+
+ if (tci & htons(VLAN_TAG_PRESENT)) {
+ err = parse_flow_nlattrs(encap, a, &key_attrs);
+ if (err)
+ return err;
+ } else if (!tci) {
+ /* Corner case for truncated 802.1Q header. */
+ if (nla_len(encap)) {
+ OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
+ return -EINVAL;
+ }
+ } else {
+ OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+ if (err)
+ return err;
+
+ if (mask) {
+ err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+ if (err)
+ return err;
+
+ if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
+ __be16 eth_type = 0;
+ __be16 tci = 0;
+
+ if (!encap_valid) {
+ OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
+ return -EINVAL;
+ }
+
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (eth_type == htons(0xffff)) {
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+ } else {
+ OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
+ ntohs(eth_type));
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+ if (err)
+ return err;
+ } else {
+ /* Populate exact match flow's key mask. */
+ if (match->mask)
+ sw_flow_mask_set(match->mask, &match->range, 0xff);
+ }
+
+ if (!match_validate(match, key_attrs, mask_attrs))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
+ * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ *
+ * This parses a series of Netlink attributes that form a flow key, which must
+ * take the same form accepted by flow_from_nlattrs(), but only enough of it to
+ * get the metadata, that is, the parts of the flow key that cannot be
+ * extracted from the packet itself.
+ */
+
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr)
+{
+ struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
+ int err;
+ struct sw_flow_match match;
+
+ flow->key.phy.in_port = DP_MAX_PORTS;
+ flow->key.phy.priority = 0;
+ flow->key.phy.skb_mark = 0;
+ memset(tun_key, 0, sizeof(flow->key.tun_key));
+
+ err = parse_flow_nlattrs(attr, a, &attrs);
+ if (err)
+ return -EINVAL;
+
+ memset(&match, 0, sizeof(match));
+ match.key = &flow->key;
+
+ err = metadata_from_nlattrs(&match, &attrs, a, false);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int ovs_nla_put_flow(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
+{
+ struct ovs_key_ethernet *eth_key;
+ struct nlattr *nla, *encap;
+ bool is_mask = (swkey != output);
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
+ goto nla_put_failure;
+
+ if ((swkey->tun_key.ipv4_dst || is_mask) &&
+ ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
+ goto nla_put_failure;
+
+ if (swkey->phy.in_port == DP_MAX_PORTS) {
+ if (is_mask && (output->phy.in_port == 0xffff))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+ goto nla_put_failure;
+ } else {
+ u16 upper_u16;
+ upper_u16 = !is_mask ? 0 : 0xffff;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+ (upper_u16 << 16) | output->phy.in_port))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
+
+ eth_key = nla_data(nla);
+ memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
+ memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
+
+ if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
+ __be16 eth_type;
+ eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+ goto nla_put_failure;
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.tci)
+ goto unencap;
+ } else
+ encap = NULL;
+
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
+ goto nla_put_failure;
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ipv4 *ipv4_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv4_key = nla_data(nla);
+ ipv4_key->ipv4_src = output->ipv4.addr.src;
+ ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+ ipv4_key->ipv4_proto = output->ip.proto;
+ ipv4_key->ipv4_tos = output->ip.tos;
+ ipv4_key->ipv4_ttl = output->ip.ttl;
+ ipv4_key->ipv4_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ipv6 *ipv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_key = nla_data(nla);
+ memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
+ sizeof(ipv6_key->ipv6_src));
+ memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
+ sizeof(ipv6_key->ipv6_dst));
+ ipv6_key->ipv6_label = output->ipv6.label;
+ ipv6_key->ipv6_proto = output->ip.proto;
+ ipv6_key->ipv6_tclass = output->ip.tos;
+ ipv6_key->ipv6_hlimit = output->ip.ttl;
+ ipv6_key->ipv6_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+ swkey->eth.type == htons(ETH_P_RARP)) {
+ struct ovs_key_arp *arp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
+ if (!nla)
+ goto nla_put_failure;
+ arp_key = nla_data(nla);
+ memset(arp_key, 0, sizeof(struct ovs_key_arp));
+ arp_key->arp_sip = output->ipv4.addr.src;
+ arp_key->arp_tip = output->ipv4.addr.dst;
+ arp_key->arp_op = htons(output->ip.proto);
+ memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
+ memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
+ }
+
+ if ((swkey->eth.type == htons(ETH_P_IP) ||
+ swkey->eth.type == htons(ETH_P_IPV6)) &&
+ swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+
+ if (swkey->ip.proto == IPPROTO_TCP) {
+ struct ovs_key_tcp *tcp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
+ if (!nla)
+ goto nla_put_failure;
+ tcp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ tcp_key->tcp_src = output->ipv4.tp.src;
+ tcp_key->tcp_dst = output->ipv4.tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->ipv4.tp.flags))
+ goto nla_put_failure;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ tcp_key->tcp_src = output->ipv6.tp.src;
+ tcp_key->tcp_dst = output->ipv6.tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->ipv6.tp.flags))
+ goto nla_put_failure;
+ }
+ } else if (swkey->ip.proto == IPPROTO_UDP) {
+ struct ovs_key_udp *udp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
+ if (!nla)
+ goto nla_put_failure;
+ udp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ udp_key->udp_src = output->ipv4.tp.src;
+ udp_key->udp_dst = output->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ udp_key->udp_src = output->ipv6.tp.src;
+ udp_key->udp_dst = output->ipv6.tp.dst;
+ }
+ } else if (swkey->ip.proto == IPPROTO_SCTP) {
+ struct ovs_key_sctp *sctp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+ if (!nla)
+ goto nla_put_failure;
+ sctp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ sctp_key->sctp_src = swkey->ipv4.tp.src;
+ sctp_key->sctp_dst = swkey->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ sctp_key->sctp_src = swkey->ipv6.tp.src;
+ sctp_key->sctp_dst = swkey->ipv6.tp.dst;
+ }
+ } else if (swkey->eth.type == htons(ETH_P_IP) &&
+ swkey->ip.proto == IPPROTO_ICMP) {
+ struct ovs_key_icmp *icmp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmp_key = nla_data(nla);
+ icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
+ icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
+ } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
+ swkey->ip.proto == IPPROTO_ICMPV6) {
+ struct ovs_key_icmpv6 *icmpv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
+ sizeof(*icmpv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmpv6_key = nla_data(nla);
+ icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
+ icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
+
+ if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ struct ovs_key_nd *nd_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
+ if (!nla)
+ goto nla_put_failure;
+ nd_key = nla_data(nla);
+ memcpy(nd_key->nd_target, &output->ipv6.nd.target,
+ sizeof(nd_key->nd_target));
+ memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
+ memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
+ }
+ }
+ }
+
+unencap:
+ if (encap)
+ nla_nest_end(skb, encap);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+#define MAX_ACTIONS_BUFSIZE (32 * 1024)
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
+{
+ struct sw_flow_actions *sfa;
+
+ if (size > MAX_ACTIONS_BUFSIZE)
+ return ERR_PTR(-EINVAL);
+
+ sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->actions_len = 0;
+ return sfa;
+}
+
+/* RCU callback used by ovs_nla_free_flow_actions. */
+static void rcu_free_acts_callback(struct rcu_head *rcu)
+{
+ struct sw_flow_actions *sf_acts = container_of(rcu,
+ struct sw_flow_actions, rcu);
+ kfree(sf_acts);
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
+}
+
+static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
+ int attr_len)
+{
+
+ struct sw_flow_actions *acts;
+ int new_acts_size;
+ int req_size = NLA_ALIGN(attr_len);
+ int next_offset = offsetof(struct sw_flow_actions, actions) +
+ (*sfa)->actions_len;
+
+ if (req_size <= (ksize(*sfa) - next_offset))
+ goto out;
+
+ new_acts_size = ksize(*sfa) * 2;
+
+ if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
+ if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
+ return ERR_PTR(-EMSGSIZE);
+ new_acts_size = MAX_ACTIONS_BUFSIZE;
+ }
+
+ acts = ovs_nla_alloc_flow_actions(new_acts_size);
+ if (IS_ERR(acts))
+ return (void *)acts;
+
+ memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
+ acts->actions_len = (*sfa)->actions_len;
+ kfree(*sfa);
+ *sfa = acts;
+
+out:
+ (*sfa)->actions_len += req_size;
+ return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
+{
+ struct nlattr *a;
+
+ a = reserve_sfa_size(sfa, nla_attr_size(len));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ a->nla_type = attrtype;
+ a->nla_len = nla_attr_size(len);
+
+ if (data)
+ memcpy(nla_data(a), data, len);
+ memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
+
+ return 0;
+}
+
+static inline int add_nested_action_start(struct sw_flow_actions **sfa,
+ int attrtype)
+{
+ int used = (*sfa)->actions_len;
+ int err;
+
+ err = add_action(sfa, attrtype, NULL, 0);
+ if (err)
+ return err;
+
+ return used;
+}
+
+static inline void add_nested_action_end(struct sw_flow_actions *sfa,
+ int st_offset)
+{
+ struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
+ st_offset);
+
+ a->nla_len = sfa->actions_len - st_offset;
+}
+
+static int validate_and_copy_sample(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+ const struct nlattr *probability, *actions;
+ const struct nlattr *a;
+ int rem, start, err, st_acts;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+ if (!probability || nla_len(probability) != sizeof(u32))
+ return -EINVAL;
+
+ actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+
+ /* validation done, copy sample action. */
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
+ if (start < 0)
+ return start;
+ err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
+ nla_data(probability), sizeof(u32));
+ if (err)
+ return err;
+ st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
+ if (st_acts < 0)
+ return st_acts;
+
+ err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, st_acts);
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
+static int validate_tp_port(const struct sw_flow_key *flow_key)
+{
+ if (flow_key->eth.type == htons(ETH_P_IP)) {
+ if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
+ return 0;
+ } else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
+ if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct sw_flow_mask *mask)
+{
+ memset(match, 0, sizeof(*match));
+ match->key = key;
+ match->mask = mask;
+
+ memset(key, 0, sizeof(*key));
+
+ if (mask) {
+ memset(&mask->key, 0, sizeof(mask->key));
+ mask->range.start = mask->range.end = 0;
+ }
+}
+
+static int validate_and_copy_set_tun(const struct nlattr *attr,
+ struct sw_flow_actions **sfa)
+{
+ struct sw_flow_match match;
+ struct sw_flow_key key;
+ int err, start;
+
+ ovs_match_init(&match, &key, NULL);
+ err = ipv4_tun_from_nlattr(nla_data(attr), &match, false);
+ if (err)
+ return err;
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
+ if (start < 0)
+ return start;
+
+ err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+ sizeof(match.key->tun_key));
+ add_nested_action_end(*sfa, start);
+
+ return err;
+}
+
+static int validate_set(const struct nlattr *a,
+ const struct sw_flow_key *flow_key,
+ struct sw_flow_actions **sfa,
+ bool *set_tun)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+
+ /* There can be only one key in a action */
+ if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ return -EINVAL;
+
+ if (key_type > OVS_KEY_ATTR_MAX ||
+ (ovs_key_lens[key_type] != nla_len(ovs_key) &&
+ ovs_key_lens[key_type] != -1))
+ return -EINVAL;
+
+ switch (key_type) {
+ const struct ovs_key_ipv4 *ipv4_key;
+ const struct ovs_key_ipv6 *ipv6_key;
+ int err;
+
+ case OVS_KEY_ATTR_PRIORITY:
+ case OVS_KEY_ATTR_SKB_MARK:
+ case OVS_KEY_ATTR_ETHERNET:
+ break;
+
+ case OVS_KEY_ATTR_TUNNEL:
+ *set_tun = true;
+ err = validate_and_copy_set_tun(a, sfa);
+ if (err)
+ return err;
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ if (flow_key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv4_key = nla_data(ovs_key);
+ if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_IPV6:
+ if (flow_key->eth.type != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ if (!flow_key->ip.proto)
+ return -EINVAL;
+
+ ipv6_key = nla_data(ovs_key);
+ if (ipv6_key->ipv6_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv6_key->ipv6_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ if (flow_key->ip.proto != IPPROTO_TCP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_UDP:
+ if (flow_key->ip.proto != IPPROTO_UDP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ case OVS_KEY_ATTR_SCTP:
+ if (flow_key->ip.proto != IPPROTO_SCTP)
+ return -EINVAL;
+
+ return validate_tp_port(flow_key);
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+ static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+ [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
+ };
+ struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+ int error;
+
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+ attr, userspace_policy);
+ if (error)
+ return error;
+
+ if (!a[OVS_USERSPACE_ATTR_PID] ||
+ !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int copy_action(const struct nlattr *from,
+ struct sw_flow_actions **sfa)
+{
+ int totlen = NLA_ALIGN(from->nla_len);
+ struct nlattr *to;
+
+ to = reserve_sfa_size(sfa, from->nla_len);
+ if (IS_ERR(to))
+ return PTR_ERR(to);
+
+ memcpy(to, from, totlen);
+ return 0;
+}
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth,
+ struct sw_flow_actions **sfa)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ if (depth >= SAMPLE_ACTION_DEPTH)
+ return -EOVERFLOW;
+
+ nla_for_each_nested(a, attr, rem) {
+ /* Expected argument lengths, (u32)-1 for variable length. */
+ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+ [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+ [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+ [OVS_ACTION_ATTR_POP_VLAN] = 0,
+ [OVS_ACTION_ATTR_SET] = (u32)-1,
+ [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+ };
+ const struct ovs_action_push_vlan *vlan;
+ int type = nla_type(a);
+ bool skip_copy;
+
+ if (type > OVS_ACTION_ATTR_MAX ||
+ (action_lens[type] != nla_len(a) &&
+ action_lens[type] != (u32)-1))
+ return -EINVAL;
+
+ skip_copy = false;
+ switch (type) {
+ case OVS_ACTION_ATTR_UNSPEC:
+ return -EINVAL;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ err = validate_userspace(a);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ vlan = nla_data(a);
+ if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ return -EINVAL;
+ if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ return -EINVAL;
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = validate_set(a, key, sfa, &skip_copy);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = validate_and_copy_sample(a, key, depth, sfa);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (!skip_copy) {
+ err = copy_action(a, sfa);
+ if (err)
+ return err;
+ }
+ }
+
+ if (rem > 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ struct nlattr *start;
+ int err = 0, rem;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ struct nlattr *st_sample;
+
+ switch (type) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
+ if (!st_sample)
+ return -EMSGSIZE;
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, st_sample);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return err;
+}
+
+static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+ struct nlattr *start;
+ int err;
+
+ switch (key_type) {
+ case OVS_KEY_ATTR_IPV4_TUNNEL:
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+ nla_data(ovs_key));
+ if (err)
+ return err;
+ nla_nest_end(skb, start);
+ break;
+ default:
+ if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
+ return -EMSGSIZE;
+ break;
+ }
+
+ return 0;
+}
+
+int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ nla_for_each_attr(a, attr, len, rem) {
+ int type = nla_type(a);
+
+ switch (type) {
+ case OVS_ACTION_ATTR_SET:
+ err = set_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+ default:
+ if (nla_put(skb, type, nla_len(a), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
new file mode 100644
index 000000000000..440151045d39
--- /dev/null
+++ b/net/openvswitch/flow_netlink.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+
+#ifndef FLOW_NETLINK_H
+#define FLOW_NETLINK_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key, struct sw_flow_mask *mask);
+
+int ovs_nla_put_flow(const struct sw_flow_key *,
+ const struct sw_flow_key *, struct sk_buff *);
+int ovs_nla_get_flow_metadata(struct sw_flow *flow,
+ const struct nlattr *attr);
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *,
+ const struct nlattr *);
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa);
+int ovs_nla_put_actions(const struct nlattr *attr,
+ int len, struct sk_buff *skb);
+
+struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len);
+void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+
+#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
new file mode 100644
index 000000000000..e42542706087
--- /dev/null
+++ b/net/openvswitch/flow_table.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#include "datapath.h"
+
+#define TBL_MIN_BUCKETS 1024
+#define REHASH_INTERVAL (10 * 60 * HZ)
+
+static struct kmem_cache *flow_cache;
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask)
+{
+ const long *m = (long *)((u8 *)&mask->key + mask->range.start);
+ const long *s = (long *)((u8 *)src + mask->range.start);
+ long *d = (long *)((u8 *)dst + mask->range.start);
+ int i;
+
+ /* The memory outside of the 'mask->range' are not set since
+ * further operations on 'dst' only uses contents within
+ * 'mask->range'.
+ */
+ for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+ *d++ = *s++ & *m++;
+}
+
+struct sw_flow *ovs_flow_alloc(void)
+{
+ struct sw_flow *flow;
+
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&flow->lock);
+ flow->sf_acts = NULL;
+ flow->mask = NULL;
+
+ return flow;
+}
+
+int ovs_flow_tbl_count(struct flow_table *table)
+{
+ return table->count;
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ n_buckets, GFP_KERNEL);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void flow_free(struct sw_flow *flow)
+{
+ kfree((struct sf_flow_acts __force *)flow->sf_acts);
+ kmem_cache_free(flow_cache, flow);
+}
+
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+
+ flow_free(flow);
+}
+
+static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu)
+{
+ struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu);
+
+ kfree(mask);
+}
+
+static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
+{
+ if (!mask)
+ return;
+
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ if (deferred)
+ call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb);
+ else
+ kfree(mask);
+ }
+}
+
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
+{
+ if (!flow)
+ return;
+
+ flow_mask_del_ref(flow->mask, deferred);
+
+ if (deferred)
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+ else
+ flow_free(flow);
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+static void __table_instance_destroy(struct table_instance *ti)
+{
+ int i;
+
+ if (ti->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < ti->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_node *n;
+ int ver = ti->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ hlist_del(&flow->hash_node[ver]);
+ ovs_flow_free(flow, false);
+ }
+ }
+
+skip_flows:
+ free_buckets(ti->buckets);
+ kfree(ti);
+}
+
+static struct table_instance *table_instance_alloc(int new_size)
+{
+ struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+ if (!ti)
+ return NULL;
+
+ ti->buckets = alloc_buckets(new_size);
+
+ if (!ti->buckets) {
+ kfree(ti);
+ return NULL;
+ }
+ ti->n_buckets = new_size;
+ ti->node_ver = 0;
+ ti->keep_flows = false;
+ get_random_bytes(&ti->hash_seed, sizeof(u32));
+
+ return ti;
+}
+
+int ovs_flow_tbl_init(struct flow_table *table)
+{
+ struct table_instance *ti;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
+
+ if (!ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(table->ti, ti);
+ INIT_LIST_HEAD(&table->mask_list);
+ table->last_rehash = jiffies;
+ table->count = 0;
+ return 0;
+}
+
+static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+
+ __table_instance_destroy(ti);
+}
+
+static void table_instance_destroy(struct table_instance *ti, bool deferred)
+{
+ if (!ti)
+ return;
+
+ if (deferred)
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ else
+ __table_instance_destroy(ti);
+}
+
+void ovs_flow_tbl_destroy(struct flow_table *table)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ table_instance_destroy(ti, false);
+}
+
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
+ u32 *bucket, u32 *last)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int ver;
+ int i;
+
+ ver = ti->node_ver;
+ while (*bucket < ti->n_buckets) {
+ i = 0;
+ head = flex_array_get(ti->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
+{
+ hash = jhash_1word(hash, ti->hash_seed);
+ return flex_array_get(ti->buckets,
+ (hash & (ti->n_buckets - 1)));
+}
+
+static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(ti, flow->hash);
+ hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head);
+}
+
+static void flow_table_copy_flows(struct table_instance *old,
+ struct table_instance *new)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head;
+
+ head = flex_array_get(old->buckets, i);
+
+ hlist_for_each_entry(flow, head, hash_node[old_ver])
+ table_instance_insert(new, flow);
+ }
+
+ old->keep_flows = true;
+}
+
+static struct table_instance *table_instance_rehash(struct table_instance *ti,
+ int n_buckets)
+{
+ struct table_instance *new_ti;
+
+ new_ti = table_instance_alloc(n_buckets);
+ if (!new_ti)
+ return NULL;
+
+ flow_table_copy_flows(ti, new_ti);
+
+ return new_ti;
+}
+
+int ovs_flow_tbl_flush(struct flow_table *flow_table)
+{
+ struct table_instance *old_ti;
+ struct table_instance *new_ti;
+
+ old_ti = ovsl_dereference(flow_table->ti);
+ new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!new_ti)
+ return -ENOMEM;
+
+ rcu_assign_pointer(flow_table->ti, new_ti);
+ flow_table->last_rehash = jiffies;
+ flow_table->count = 0;
+
+ table_instance_destroy(old_ti, true);
+ return 0;
+}
+
+static u32 flow_hash(const struct sw_flow_key *key, int key_start,
+ int key_end)
+{
+ u32 *hash_key = (u32 *)((u8 *)key + key_start);
+ int hash_u32s = (key_end - key_start) >> 2;
+
+ /* Make sure number of hash bytes are multiple of u32. */
+ BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+
+ return jhash2(hash_key, hash_u32s, 0);
+}
+
+static int flow_key_start(const struct sw_flow_key *key)
+{
+ if (key->tun_key.ipv4_dst)
+ return 0;
+ else
+ return rounddown(offsetof(struct sw_flow_key, phy),
+ sizeof(long));
+}
+
+static bool cmp_key(const struct sw_flow_key *key1,
+ const struct sw_flow_key *key2,
+ int key_start, int key_end)
+{
+ const long *cp1 = (long *)((u8 *)key1 + key_start);
+ const long *cp2 = (long *)((u8 *)key2 + key_start);
+ long diffs = 0;
+ int i;
+
+ for (i = key_start; i < key_end; i += sizeof(long))
+ diffs |= *cp1++ ^ *cp2++;
+
+ return diffs == 0;
+}
+
+static bool flow_cmp_masked_key(const struct sw_flow *flow,
+ const struct sw_flow_key *key,
+ int key_start, int key_end)
+{
+ return cmp_key(&flow->key, key, key_start, key_end);
+}
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match)
+{
+ struct sw_flow_key *key = match->key;
+ int key_start = flow_key_start(key);
+ int key_end = match->range.end;
+
+ return cmp_key(&flow->unmasked_key, key, key_start, key_end);
+}
+
+static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
+ const struct sw_flow_key *unmasked,
+ struct sw_flow_mask *mask)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int key_start = mask->range.start;
+ int key_end = mask->range.end;
+ u32 hash;
+ struct sw_flow_key masked_key;
+
+ ovs_flow_mask_key(&masked_key, unmasked, mask);
+ hash = flow_hash(&masked_key, key_start, key_end);
+ head = find_bucket(ti, hash);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
+ if (flow->mask == mask && flow->hash == hash &&
+ flow_cmp_masked_key(flow, &masked_key,
+ key_start, key_end))
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit)
+{
+ struct table_instance *ti = rcu_dereference(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ *n_mask_hit = 0;
+ list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+ (*n_mask_hit)++;
+ flow = masked_flow_lookup(ti, key, mask);
+ if (flow) /* Found */
+ return flow;
+ }
+ return NULL;
+}
+
+int ovs_flow_tbl_num_masks(const struct flow_table *table)
+{
+ struct sw_flow_mask *mask;
+ int num = 0;
+
+ list_for_each_entry(mask, &table->mask_list, list)
+ num++;
+
+ return num;
+}
+
+static struct table_instance *table_instance_expand(struct table_instance *ti)
+{
+ return table_instance_rehash(ti, ti->n_buckets * 2);
+}
+
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+
+ BUG_ON(table->count == 0);
+ hlist_del_rcu(&flow->hash_node[ti->node_ver]);
+ table->count--;
+}
+
+static struct sw_flow_mask *mask_alloc(void)
+{
+ struct sw_flow_mask *mask;
+
+ mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+ if (mask)
+ mask->ref_count = 0;
+
+ return mask;
+}
+
+static void mask_add_ref(struct sw_flow_mask *mask)
+{
+ mask->ref_count++;
+}
+
+static bool mask_equal(const struct sw_flow_mask *a,
+ const struct sw_flow_mask *b)
+{
+ u8 *a_ = (u8 *)&a->key + a->range.start;
+ u8 *b_ = (u8 *)&b->key + b->range.start;
+
+ return (a->range.end == b->range.end)
+ && (a->range.start == b->range.start)
+ && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
+}
+
+static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
+ const struct sw_flow_mask *mask)
+{
+ struct list_head *ml;
+
+ list_for_each(ml, &tbl->mask_list) {
+ struct sw_flow_mask *m;
+ m = container_of(ml, struct sw_flow_mask, list);
+ if (mask_equal(mask, m))
+ return m;
+ }
+
+ return NULL;
+}
+
+/**
+ * add a new mask into the mask list.
+ * The caller needs to make sure that 'mask' is not the same
+ * as any masks that are already on the list.
+ */
+static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
+ struct sw_flow_mask *new)
+{
+ struct sw_flow_mask *mask;
+ mask = flow_mask_find(tbl, new);
+ if (!mask) {
+ /* Allocate a new mask if none exsits. */
+ mask = mask_alloc();
+ if (!mask)
+ return -ENOMEM;
+ mask->key = new->key;
+ mask->range = new->range;
+ list_add_rcu(&mask->list, &tbl->mask_list);
+ }
+
+ mask_add_ref(mask);
+ flow->mask = mask;
+ return 0;
+}
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask)
+{
+ struct table_instance *new_ti = NULL;
+ struct table_instance *ti;
+ int err;
+
+ err = flow_mask_insert(table, flow, mask);
+ if (err)
+ return err;
+
+ flow->hash = flow_hash(&flow->key, flow->mask->range.start,
+ flow->mask->range.end);
+ ti = ovsl_dereference(table->ti);
+ table_instance_insert(ti, flow);
+ table->count++;
+
+ /* Expand table, if necessary, to make room. */
+ if (table->count > ti->n_buckets)
+ new_ti = table_instance_expand(ti);
+ else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
+ new_ti = table_instance_rehash(ti, ti->n_buckets);
+
+ if (new_ti) {
+ rcu_assign_pointer(table->ti, new_ti);
+ table_instance_destroy(ti, true);
+ table->last_rehash = jiffies;
+ }
+ return 0;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int ovs_flow_init(void)
+{
+ BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
+ BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
+
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
+ 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void ovs_flow_exit(void)
+{
+ kmem_cache_destroy(flow_cache);
+}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
new file mode 100644
index 000000000000..fbe45d5ad07d
--- /dev/null
+++ b/net/openvswitch/flow_table.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_TABLE_H
+#define FLOW_TABLE_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+struct table_instance {
+ struct flex_array *buckets;
+ unsigned int n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
+};
+
+struct flow_table {
+ struct table_instance __rcu *ti;
+ struct list_head mask_list;
+ unsigned long last_rehash;
+ unsigned int count;
+};
+
+int ovs_flow_init(void);
+void ovs_flow_exit(void);
+
+struct sw_flow *ovs_flow_alloc(void);
+void ovs_flow_free(struct sw_flow *, bool deferred);
+
+int ovs_flow_tbl_init(struct flow_table *);
+int ovs_flow_tbl_count(struct flow_table *table);
+void ovs_flow_tbl_destroy(struct flow_table *table);
+int ovs_flow_tbl_flush(struct flow_table *flow_table);
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ struct sw_flow_mask *mask);
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+int ovs_flow_tbl_num_masks(const struct flow_table *table);
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
+ u32 *bucket, u32 *idx);
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
+ const struct sw_flow_key *,
+ u32 *n_mask_hit);
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ struct sw_flow_match *match);
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask);
+#endif /* flow_table.h */
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index c99dea543d64..a3d6951602db 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -24,8 +24,6 @@
#include <linux/if_tunnel.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
-#include <linux/if_vlan.h>
-#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 98d3edbbc235..729c68763fe7 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -134,7 +134,7 @@ static void do_setup(struct net_device *netdev)
netdev->tx_queue_len = 0;
netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
- NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
+ NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
netdev->vlan_features = netdev->features;
netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 09d93c13cfd6..d21f77d875ba 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -150,15 +150,25 @@ static void free_port_rcu(struct rcu_head *rcu)
ovs_vport_free(vport_from_priv(netdev_vport));
}
-static void netdev_destroy(struct vport *vport)
+void ovs_netdev_detach_dev(struct vport *vport)
{
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- rtnl_lock();
+ ASSERT_RTNL();
netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(netdev_vport->dev,
+ netdev_master_upper_dev_get(netdev_vport->dev));
dev_set_promiscuity(netdev_vport->dev, -1);
+}
+
+static void netdev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ rtnl_lock();
+ if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
rtnl_unlock();
call_rcu(&netdev_vport->rcu, free_port_rcu);
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index dd298b5c5cdb..8df01c1127e5 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -39,5 +39,6 @@ netdev_vport_priv(const struct vport *vport)
}
const char *ovs_netdev_get_name(const struct vport *);
+void ovs_netdev_detach_dev(struct vport *);
#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 56e22b74cf96..e797a50ac2be 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -29,7 +29,6 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/ip_tunnels.h>
-#include <net/udp.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/dsfield.h>
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a9dfdda9ed1d..fdc041c57853 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -255,6 +255,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ f->time_next_packet = 0ULL;
}
return f;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f6334aa19151..7567e6f1a920 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -279,7 +279,9 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port));
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
- if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+ if (!laddr->valid || laddr->state == SCTP_ADDR_DEL ||
+ (laddr->state != SCTP_ADDR_SRC &&
+ !asoc->src_out_of_asoc_ok))
continue;
/* Do not compare against v4 addrs */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 319137340d15..e650978daf27 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -390,7 +390,6 @@ int sctp_packet_transmit(struct sctp_packet *packet)
__u8 has_data = 0;
struct dst_entry *dst = tp->dst;
unsigned char *auth = NULL; /* pointer to auth in skb data */
- __u32 cksum_buf_len = sizeof(struct sctphdr);
pr_debug("%s: packet:%p\n", __func__, packet);
@@ -493,7 +492,6 @@ int sctp_packet_transmit(struct sctp_packet *packet)
if (chunk == packet->auth)
auth = skb_tail_pointer(nskb);
- cksum_buf_len += chunk->skb->len;
memcpy(skb_put(nskb, chunk->skb->len),
chunk->skb->data, chunk->skb->len);
@@ -538,12 +536,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
if (!sctp_checksum_disable) {
if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
(dst_xfrm(dst) != NULL) || packet->ipfragok) {
- __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
-
- /* 3) Put the resultant value into the checksum field in the
- * common header, and leave the rest of the bits unchanged.
- */
- sh->checksum = sctp_end_cksum(crc32);
+ sh->checksum = sctp_compute_cksum(nskb, 0);
} else {
/* no need to seed pseudo checksum for SCTP */
nskb->ip_summed = CHECKSUM_PARTIAL;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 666c66842799..1a6eef39ab2f 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -860,7 +860,6 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds,
(!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK))
return;
- BUG_ON(asoc->peer.primary_path == NULL);
sctp_unhash_established(asoc);
sctp_association_free(asoc);
}
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index c959312c45e3..e2fa133f9fba 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -16,8 +16,8 @@ config X25
if you want that) and the lower level data link layer protocol LAPB
(say Y to "LAPB Data Link Driver" below if you want that).
- You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
- <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+ You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and
+ <http://docwiki.cisco.com/wiki/X.25>.
Information about X.25 for Linux is contained in the files
<file:Documentation/networking/x25.txt> and
<file:Documentation/networking/x25-iface.txt>.
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 2906d520eea7..ccfdc7115a83 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -141,14 +141,14 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
const int plen = skb->len;
int dlen = IPCOMP_SCRATCH_SIZE;
u8 *start = skb->data;
- const int cpu = get_cpu();
- u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
- struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+ struct crypto_comp *tfm;
+ u8 *scratch;
int err;
local_bh_disable();
+ scratch = *this_cpu_ptr(ipcomp_scratches);
+ tfm = *this_cpu_ptr(ipcd->tfms);
err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
- local_bh_enable();
if (err)
goto out;
@@ -158,13 +158,13 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
}
memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
- put_cpu();
+ local_bh_enable();
pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
return 0;
out:
- put_cpu();
+ local_bh_enable();
return err;
}
@@ -220,8 +220,8 @@ static void ipcomp_free_scratches(void)
static void * __percpu *ipcomp_alloc_scratches(void)
{
- int i;
void * __percpu *scratches;
+ int i;
if (ipcomp_scratch_users++)
return ipcomp_scratches;
@@ -233,7 +233,9 @@ static void * __percpu *ipcomp_alloc_scratches(void)
ipcomp_scratches = scratches;
for_each_possible_cpu(i) {
- void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
+ void *scratch;
+
+ scratch = vmalloc_node(IPCOMP_SCRATCH_SIZE, cpu_to_node(i));
if (!scratch)
return NULL;
*per_cpu_ptr(scratches, i) = scratch;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 76e1873811d4..9a91f7431c41 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1844,6 +1844,13 @@ static int xdst_queue_output(struct sk_buff *skb)
struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
struct xfrm_policy *pol = xdst->pols[0];
struct xfrm_policy_queue *pq = &pol->polq;
+ const struct sk_buff *fclone = skb + 1;
+
+ if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+ fclone->fclone == SKB_FCLONE_CLONE)) {
+ kfree_skb(skb);
+ return 0;
+ }
if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
kfree_skb(skb);