From 681f130f39e10087475383e6771b9366e26bab0c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jun 2013 05:52:22 -0700 Subject: netfilter: xt_socket: add XT_SOCKET_NOWILDCARD flag The xt_socket module can be a nice replacement for the conntrack module in some cases (SYN filtering, for example), but it lacks the ability to match the 3rd packet of the TCP handshake (the ACK coming from the client). Add an XT_SOCKET_NOWILDCARD flag to disable the wildcard mechanism. The wildcard is the legacy socket match behavior, which ignores LISTEN sockets bound to INADDR_ANY (or the IPv6 equivalent). Example: iptables -I INPUT -p tcp --syn -j SYN_CHAIN iptables -I INPUT -m socket --nowildcard -j ACCEPT Signed-off-by: Eric Dumazet Cc: Patrick McHardy Cc: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_socket.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h index 26d7217bd4f1..6315e2ac3474 100644 --- a/include/uapi/linux/netfilter/xt_socket.h +++ b/include/uapi/linux/netfilter/xt_socket.h @@ -5,10 +5,17 @@ enum { XT_SOCKET_TRANSPARENT = 1 << 0, + XT_SOCKET_NOWILDCARD = 1 << 1, }; struct xt_socket_mtinfo1 { __u8 flags; }; +#define XT_SOCKET_FLAGS_V1 XT_SOCKET_TRANSPARENT + +struct xt_socket_mtinfo2 { + __u8 flags; +}; +#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD) #endif /* _XT_SOCKET_H */ -- cgit v1.2.3 From bba54de5bdd107d3841b560f1a9cb0ed06e79533 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 16 Jun 2013 09:09:36 +0300 Subject: ipvs: provide iph to schedulers Until now, the schedulers needed access only to IP addresses, and it was easy to get them from skb by using ip_vs_fill_iph_addr_only. New changes for the SH scheduler will need the protocol and ports, which are difficult to get from skb for the IPv6 case. 
As we have all the data in the iph structure, to avoid the same slow lookups provide the iph to schedulers. Signed-off-by: Julian Anastasov Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- include/net/ip_vs.h | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4405886980c7..f5faf859876e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -197,31 +197,6 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr) } } -/* This function is a faster version of ip_vs_fill_iph_skb(). - * Where we only populate {s,d}addr (and avoid calling ipv6_find_hdr()). - * This is used by the some of the ip_vs_*_schedule() functions. - * (Mostly done to avoid ABI breakage of external schedulers) - */ -static inline void -ip_vs_fill_iph_addr_only(int af, const struct sk_buff *skb, - struct ip_vs_iphdr *iphdr) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - const struct ipv6hdr *iph = - (struct ipv6hdr *)skb_network_header(skb); - iphdr->saddr.in6 = iph->saddr; - iphdr->daddr.in6 = iph->daddr; - } else -#endif - { - const struct iphdr *iph = - (struct iphdr *)skb_network_header(skb); - iphdr->saddr.ip = iph->saddr; - iphdr->daddr.ip = iph->daddr; - } -} - static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { @@ -814,7 +789,8 @@ struct ip_vs_scheduler { /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, - const struct sk_buff *skb); + const struct sk_buff *skb, + struct ip_vs_iphdr *iph); }; /* The persistence engine object */ -- cgit v1.2.3 From c6c96c188336b2b95d5f14facd101f1e4165a9d3 Mon Sep 17 00:00:00 2001 From: Alexander Frolkin Date: Thu, 13 Jun 2013 08:56:15 +0100 Subject: ipvs: sloppy TCP and SCTP This adds support for sloppy TCP and SCTP modes to IPVS. 
When enabled (sysctls net.ipv4.vs.sloppy_tcp and net.ipv4.vs.sloppy_sctp), allows IPVS to create connection state on any packet, not just a TCP SYN (or SCTP INIT). This allows connections to fail over from one IPVS director to another mid-flight. Signed-off-by: Alexander Frolkin Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f5faf859876e..95860dfdfbe3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -978,6 +978,8 @@ struct netns_ipvs { int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; + int sysctl_sloppy_tcp; + int sysctl_sloppy_sctp; int sysctl_expire_quiescent_template; int sysctl_sync_threshold[2]; unsigned int sysctl_sync_refresh_period; @@ -1020,6 +1022,8 @@ struct netns_ipvs { #define DEFAULT_SYNC_THRESHOLD 3 #define DEFAULT_SYNC_PERIOD 50 #define DEFAULT_SYNC_VER 1 +#define DEFAULT_SLOPPY_TCP 0 +#define DEFAULT_SLOPPY_SCTP 0 #define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ) #define DEFAULT_SYNC_RETRIES 0 #define IPVS_SYNC_WAKEUP_RATE 8 @@ -1056,6 +1060,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) return ipvs->sysctl_sync_ver; } +static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sloppy_tcp; +} + +static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sloppy_sctp; +} + static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return ACCESS_ONCE(ipvs->sysctl_sync_ports); @@ -1109,6 +1123,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) return DEFAULT_SYNC_VER; } +static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) +{ + return DEFAULT_SLOPPY_TCP; +} + +static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) +{ + return DEFAULT_SLOPPY_SCTP; +} + static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return 1; -- 
cgit v1.2.3 From 61e7c420b4b2a797ac209106ba743ab6ebe984d8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 18 Jun 2013 10:08:07 +0300 Subject: ipvs: replace the SCTP state machine Convert the SCTP state table so that it is more readable. Change the states to follow the diagram in RFC 2960 and add more states suitable for a middlebox. Still, such a change in states adds incompatibility if some systems in a sync setup include this change and others do not. With this change we also have proper transitions in INPUT-ONLY mode (DR/TUN), where we see packets only from the client. Now we should not switch to the 10-second CLOSED state at a time when we should stay in the ESTABLISHED state. The state names are short because we have a 16-char space in ipvsadm and an 11-char limit for the connection list format. This is a consequence of the TCP implementation, where the longest state name is ESTABLISHED. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 95860dfdfbe3..e667df171003 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -380,17 +380,18 @@ enum { */ enum ip_vs_sctp_states { IP_VS_SCTP_S_NONE, - IP_VS_SCTP_S_INIT_CLI, - IP_VS_SCTP_S_INIT_SER, - IP_VS_SCTP_S_INIT_ACK_CLI, - IP_VS_SCTP_S_INIT_ACK_SER, - IP_VS_SCTP_S_ECHO_CLI, - IP_VS_SCTP_S_ECHO_SER, + IP_VS_SCTP_S_INIT1, + IP_VS_SCTP_S_INIT, + IP_VS_SCTP_S_COOKIE_SENT, + IP_VS_SCTP_S_COOKIE_REPLIED, + IP_VS_SCTP_S_COOKIE_WAIT, + IP_VS_SCTP_S_COOKIE, + IP_VS_SCTP_S_COOKIE_ECHOED, IP_VS_SCTP_S_ESTABLISHED, - IP_VS_SCTP_S_SHUT_CLI, - IP_VS_SCTP_S_SHUT_SER, - IP_VS_SCTP_S_SHUT_ACK_CLI, - IP_VS_SCTP_S_SHUT_ACK_SER, + IP_VS_SCTP_S_SHUTDOWN_SENT, + IP_VS_SCTP_S_SHUTDOWN_RECEIVED, + IP_VS_SCTP_S_SHUTDOWN_ACK_SENT, + IP_VS_SCTP_S_REJECTED, IP_VS_SCTP_S_CLOSED, IP_VS_SCTP_S_LAST }; -- cgit v1.2.3 From 
eba3b5a78799d21dea05118b294524958f0ab592 Mon Sep 17 00:00:00 2001 From: Alexander Frolkin Date: Wed, 19 Jun 2013 10:54:25 +0100 Subject: ipvs: SH fallback and L4 hashing By default the SH scheduler rejects connections that are hashed onto a realserver of weight 0. This patch adds a flag to make SH choose a different realserver in this case, instead of rejecting the connection. The patch also adds a flag to make SH include the source port (TCP, UDP, SCTP) in the hash as well as the source address. This basically allows for deterministic round-robin load balancing (i.e., where any director in a cluster of directors with identical config will send the same packet the same way). The flags are service flags (IP_VS_SVC_F_SCHED*) so that these options can be set per service. They are set using a new option to ipvsadm. Signed-off-by: Alexander Frolkin Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/uapi/linux/ip_vs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index a24537725e80..29458223d044 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -20,6 +20,12 @@ #define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ #define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ #define IP_VS_SVC_F_ONEPACKET 0x0004 /* one-packet scheduling */ +#define IP_VS_SVC_F_SCHED1 0x0008 /* scheduler flag 1 */ +#define IP_VS_SVC_F_SCHED2 0x0010 /* scheduler flag 2 */ +#define IP_VS_SVC_F_SCHED3 0x0020 /* scheduler flag 3 */ + +#define IP_VS_SVC_F_SCHED_SH_FALLBACK IP_VS_SVC_F_SCHED1 /* SH fallback */ +#define IP_VS_SVC_F_SCHED_SH_PORT IP_VS_SVC_F_SCHED2 /* SH use port */ /* * Destination Server Flags -- cgit v1.2.3 From 4d0c875dcc4923476f364e83912d134da2df224c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 24 Jun 2013 22:44:41 +0300 Subject: ipvs: add sync_persist_mode flag Add sync_persist_mode flag to reduce sync traffic by syncing only persistent 
templates. Signed-off-by: Julian Anastasov Tested-by: Aleksey Chudov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e667df171003..f0d70f066f3d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -975,6 +975,7 @@ struct netns_ipvs { int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; + int sysctl_sync_persist_mode; unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; @@ -1076,6 +1077,11 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return ACCESS_ONCE(ipvs->sysctl_sync_ports); } +static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_sync_persist_mode; +} + static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; @@ -1139,6 +1145,11 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return 1; } +static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) +{ + return 0; +} + static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; -- cgit v1.2.3 From 8b4d14d8eb36874daf159d33dcccd4746a6f3189 Mon Sep 17 00:00:00 2001 From: JunweiZhang Date: Wed, 26 Jun 2013 16:40:06 +0800 Subject: netns: exclude ipvs from struct net when IPVS disabled No real problem is fixed; this just saves a few bytes in the net_namespace structure. 
Signed-off-by: JunweiZhang Signed-off-by: Nicolas Dichtel Reviewed-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/net_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 495bc57f292c..84e37b1ca9e1 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -115,7 +115,9 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif +#if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; +#endif struct sock *diag_nlsk; atomic_t rt_genid; atomic_t fnhe_genid; -- cgit v1.2.3 From 496e4ae7dc944faa1721bfda7e9d834d5611a874 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jun 2013 14:15:47 +0200 Subject: netfilter: nf_queue: add NFQA_SKB_CSUM_NOTVERIFIED info flag The common case is that TCP/IP checksums have already been verified, e.g. by hardware (rx checksum offload), or conntrack. Userspace can use this flag to determine when the checksum has not been validated yet. If the flag is set, this doesn't necessarily mean that the packet has an invalid checksum, e.g. if the NIC doesn't support rx checksum offload. Userspace that successfully enabled the NFQA_CFG_F_GSO queue feature flag can infer that the IP/TCP checksum has already been validated if either the SKB_INFO attribute is not present or the NFQA_SKB_CSUM_NOTVERIFIED flag is unset. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index a2308ae5a73d..3a9b92147339 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -105,5 +105,7 @@ enum nfqnl_attr_config { #define NFQA_SKB_CSUMNOTREADY (1 << 0) /* packet is GSO (i.e., exceeds device mtu) */ #define NFQA_SKB_GSO (1 << 1) +/* csum not validated (incoming device doesn't support hw checksum, etc.) */ +#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2) #endif /* _NFNETLINK_QUEUE_H */ -- cgit v1.2.3