From f719e3754ee2f7275437e61a6afd520181fdd43b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 5 Mar 2016 15:03:22 +0200 Subject: ipvs: drop first packet to redirect conntrack Jiri Bohac is reporting for a problem where the attempt to reschedule existing connection to another real server needs proper redirect for the conntrack used by the IPVS connection. For example, when IPVS connection is created to NAT-ed real server we alter the reply direction of conntrack. If we later decide to select different real server we can not alter again the conntrack. And if we expire the old connection, the new connection is left without conntrack. So, the only way to redirect both the IPVS connection and the Netfilter's conntrack is to drop the SYN packet that hits existing connection, to wait for the next jiffie to expire the old connection and its conntrack and to rely on client's retransmission to create new connection as usually. Jiri Bohac provided a fix that drops all SYNs on rescheduling, I extended his patch to do such drops only for connections that use conntrack. Here is the original report from Jiri Bohac: Since commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead"), new connections to dead servers are redistributed immediately to new servers. The old connection is expired using ip_vs_conn_expire_now() which sets the connection timer to expire immediately. However, before the timer callback, ip_vs_conn_expire(), is run to clean the connection's conntrack entry, the new redistributed connection may already be established and its conntrack removed instead. Fix this by dropping the first packet of the new connection instead, like we do when the destination server is not available. The timer will have deleted the old conntrack entry long before the first packet of the new connection is retransmitted. Fixes: dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead") Signed-off-by: Jiri Bohac Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0816c872b689..a6cc576fd467 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) } #endif /* CONFIG_IP_VS_NFCT */ +/* Really using conntrack? */ +static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, + struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_NFCT + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + return false; + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + return true; +#endif + return false; +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { -- cgit v1.2.3 From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:16 -0800 Subject: netfilter: Remove IP_CT_NEW_REPLY definition. Remove the definition of IP_CT_NEW_REPLY from the kernel as it does not make sense. This allows the definition of IP_CT_NUMBER to be simplified as well. Signed-off-by: Jarno Rajahalme Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 319f47128db8..6d074d14ee27 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -20,9 +20,15 @@ enum ip_conntrack_info { IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY, IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY, - IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY, - /* Number of distinct IP_CT types (no NEW in reply dirn). */ - IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 + /* No NEW in reply direction. */ + + /* Number of distinct IP_CT types. */ + IP_CT_NUMBER, + + /* only for userspace compatibility */ +#ifndef __KERNEL__ + IP_CT_NEW_REPLY = IP_CT_NUMBER, +#endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) -- cgit v1.2.3 From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:23 -0800 Subject: openvswitch: Interface with NAT. Extend OVS conntrack interface to cover NAT. New nested OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action. A bare OVS_CT_ATTR_NAT only mangles existing and expected connections. If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested attributes, new (non-committed/non-confirmed) connections are mangled according to the rest of the nested attributes. The corresponding OVS userspace patch series includes test cases (in tests/system-traffic.at) that also serve as example uses. This work extends on a branch by Thomas Graf at https://github.com/tgraf/ovs/tree/nat. Signed-off-by: Jarno Rajahalme Acked-by: Thomas Graf Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/openvswitch.h | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a27222d5b413..616d04761730 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -454,6 +454,14 @@ struct ovs_key_ct_labels { #define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */ #define OVS_CS_F_INVALID 0x10 /* Could not track connection. */ #define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. */ +#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was + * mangled by NAT. + */ +#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port + * was mangled by NAT. + */ + +#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. @@ -632,6 +640,8 @@ struct ovs_action_hash { * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. + * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address + * translation (NAT) on the packet. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -641,11 +651,50 @@ enum ovs_ct_attr { OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */ OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of related connections. */ + OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ __OVS_CT_ATTR_MAX }; #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) +/** + * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT. + * + * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port). + * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination + * address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be + * specified. Effective only for packets for ct_state NEW connections. + * Packets of committed connections are mangled by the NAT action according to + * the committed NAT type regardless of the flags specified. As a corollary, a + * NAT action without a NAT type flag will only mangle packets of committed + * connections. The following NAT attributes only apply for NEW + * (non-committed) connections, and they may be included only when the CT + * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or + * @OVS_NAT_ATTR_DST is also included. + * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port) + * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port) + * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots + * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5) + * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping + */ +enum ovs_nat_attr { + OVS_NAT_ATTR_UNSPEC, + OVS_NAT_ATTR_SRC, + OVS_NAT_ATTR_DST, + OVS_NAT_ATTR_IP_MIN, + OVS_NAT_ATTR_IP_MAX, + OVS_NAT_ATTR_PROTO_MIN, + OVS_NAT_ATTR_PROTO_MAX, + OVS_NAT_ATTR_PERSISTENT, + OVS_NAT_ATTR_PROTO_HASH, + OVS_NAT_ATTR_PROTO_RANDOM, + __OVS_NAT_ATTR_MAX, +}; + +#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) + /** * enum ovs_action_attr - Action types. * -- cgit v1.2.3