From d2bf2f34cc1a8304a5dab0d42e7a2ae58ede94cd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Feb 2014 15:25:32 +0100 Subject: netfilter: nft_ct: labels get support This also adds NF_CT_LABELS_MAX_SIZE so it can be re-used as BUILD_BUG_ON in nft_ct. At this time, nft doesn't yet support writing to the label area; when this changes the label->words handling needs to be moved out of xt_connlabel.c into nf_conntrack_labels.c. Also removes a useless run-time check: words cannot grow beyond 4 (32 bit) or 2 (64bit) since xt_connlabel enforces a maximum of 128 labels. Signed-off-by: Florian Westphal Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_labels.h | 4 +++- include/uapi/linux/netfilter/nf_tables.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index c985695283b3..dec6336bf850 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -7,6 +7,8 @@ #include +#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) + struct nf_conn_labels { u8 words; unsigned long bits[]; @@ -29,7 +31,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) u8 words; words = ACCESS_ONCE(net->ct.label_words); - if (words == 0 || WARN_ON_ONCE(words > 8)) + if (words == 0) return NULL; cl_ext = nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 83c985a6170b..c84c452c62a7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -601,6 +601,7 @@ enum nft_ct_keys { NFT_CT_PROTOCOL, NFT_CT_PROTO_SRC, NFT_CT_PROTO_DST, + NFT_CT_LABELS, }; /** -- cgit v1.2.3 From 0eb5db7ad302a24fe6f0eb4bfd235357047a28db Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 18 Feb 2014 18:06:48 +0000 Subject: netfilter: nfnetlink: add rcu_dereference_protected() helpers Add a lockdep_nfnl_is_held() function and a nfnl_dereference() macro for RCU dereferences protected by a NFNL subsystem mutex. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 28c74367e900..e955d4730625 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -44,6 +44,27 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_nfnl_is_held(__u8 subsys_id); +#else +static inline int lockdep_nfnl_is_held(__u8 subsys_id) +{ + return 1; +} +#endif /* CONFIG_PROVE_LOCKING */ + +/* + * nfnl_dereference - fetch RCU pointer when updates are prevented by subsys mutex + * + * @p: The pointer to read, prior to dereferencing + * @ss: The nfnetlink subsystem ID + * + * Return the value of the specified RCU-protected pointer, but omit + * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * caller holds the NFNL subsystem mutex. + */ +#define nfnl_dereference(p, ss) \ + rcu_dereference_protected(p, lockdep_nfnl_is_held(ss)) #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) -- cgit v1.2.3 From 67a8fc27cca06e185c1ab39baaccd2103f6f9f51 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 18 Feb 2014 18:06:49 +0000 Subject: netfilter: nf_tables: add nft_dereference() macro Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e7e14ffe0f6a..81abd61500f4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -521,6 +522,9 @@ void nft_unregister_chain_type(const struct nf_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); +#define nft_dereference(p) \ + nfnl_dereference(p, NFNL_SUBSYS_NFTABLES) + #define MODULE_ALIAS_NFT_FAMILY(family) \ MODULE_ALIAS("nft-afinfo-" __stringify(family)) -- cgit v1.2.3 From 0768b3b3d228c5acf2075f40f3d25cda30011d4f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 19 Feb 2014 17:27:06 +0100 Subject: netfilter: nf_tables: add optional user data area to rules This allows us to store user comment strings, but it could be also used to store any kind of information that the user application needs to link to the rule. Scratch 8 bits for the new ulen field that indicates the length the user data area. 4 bits from the handle (so it's 42 bits long, according to Patrick, it would last 139 years with 1000 new rules per second) and 4 bits from dlen (so the expression data area is 4K, which seems sufficient by now even considering the compatibility layer). Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy --- include/net/netfilter/nf_tables.h | 11 +++++++++-- include/uapi/linux/netfilter/nf_tables.h | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 81abd61500f4..5af56da6d6c6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -326,13 +326,15 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data + * @ulen: length of user data (used for comments) * @data: expression data */ struct nft_rule { struct list_head list; - u64 handle:46, + u64 handle:42, genmask:2, - dlen:16; + dlen:12, + ulen:8; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; @@ -371,6 +373,11 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } +static inline void *nft_userdata(const struct nft_rule *rule) +{ + return (void *)&rule->data[rule->dlen]; +} + /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c84c452c62a7..c88ccbfda5f1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,7 +1,8 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H -#define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_USERDATA_MAXLEN 256 enum nft_registers { NFT_REG_VERDICT, @@ -156,6 +157,7 @@ enum nft_chain_attributes { * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes) * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes) * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64) + * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) */ enum nft_rule_attributes { NFTA_RULE_UNSPEC, @@ -165,6 +167,7 @@ enum nft_rule_attributes { NFTA_RULE_EXPRESSIONS, NFTA_RULE_COMPAT, NFTA_RULE_POSITION, + NFTA_RULE_USERDATA, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) -- cgit v1.2.3 From 3b02b56cd5988d569731f6c0c26992296e46b758 Mon Sep 17 00:00:00 2001 From: Vytas Dauksa Date: Tue, 17 Dec 2013 14:01:43 +0000 Subject: netfilter: ipset: add hash:ip,mark data type to ipset Introduce packet mark support with new ip,mark hash set. This includes userspace and kernelspace code, hash:ip,mark set tests and man page updates. The intended use of ip,mark set is similar to the ip:port type, but for protocols which don't use a predictable port number. Instead of port number it matches a firewall mark determined by a layer 7 filtering program like opendpi. As well as allowing or blocking traffic it will also be used for accounting packets and bytes sent for each protocol. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 10 ++++++---- include/uapi/linux/netfilter/ipset/ip_set.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 0c7d01eae56c..4ac00d4aa87e 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -39,11 +39,13 @@ enum ip_set_feature { IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG), IPSET_TYPE_IFACE_FLAG = 5, IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG), - IPSET_TYPE_NOMATCH_FLAG = 6, + IPSET_TYPE_MARK_FLAG = 6, + IPSET_TYPE_MARK = (1 << IPSET_TYPE_MARK_FLAG), + IPSET_TYPE_NOMATCH_FLAG = 7, IPSET_TYPE_NOMATCH = (1 << IPSET_TYPE_NOMATCH_FLAG), /* Strictly speaking not a feature, but a flag for dumping: * this settype must be dumped last */ - IPSET_DUMP_LAST_FLAG = 7, + IPSET_DUMP_LAST_FLAG = 8, IPSET_DUMP_LAST = (1 << IPSET_DUMP_LAST_FLAG), }; @@ -171,8 +173,6 @@ struct ip_set_type { char name[IPSET_MAXNAMELEN]; /* Protocol version */ u8 protocol; - /* Set features to control swapping */ - u8 features; /* Set type dimension */ u8 dimension; /* @@ -182,6 +182,8 @@ struct ip_set_type { u8 family; /* Type revisions */ u8 revision_min, revision_max; + /* Set features to control swapping */ + u16 features; /* Create set */ int (*create)(struct net *net, struct ip_set *set, diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 25d3b2f79c02..5368f8275774 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -82,6 +82,7 @@ enum { IPSET_ATTR_PROTO, /* 7 */ IPSET_ATTR_CADT_FLAGS, /* 8 */ IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ + IPSET_ATTR_MARK, /* 10 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ -- cgit v1.2.3 From 4d0e5c076d01d3fb4767a502a9517923fb9a080e Mon Sep 17 00:00:00 2001 From: Vytas Dauksa Date: Tue, 17 Dec 2013 14:01:44 +0000 Subject: netfilter: ipset: add markmask for hash:ip,mark data type Introduce packet mark mask for hash:ip,mark data type. This allows to set mark bit filter for the ip set. Change-Id: Id8dd9ca7e64477c4f7b022a1d9c1a5b187f1c96e Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 5368f8275774..f636f282b142 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -89,6 +89,7 @@ enum { IPSET_ATTR_GC, IPSET_ATTR_HASHSIZE, IPSET_ATTR_MAXELEM, + IPSET_ATTR_MARKMASK, IPSET_ATTR_NETMASK, IPSET_ATTR_PROBES, IPSET_ATTR_RESIZE, @@ -138,6 +139,7 @@ enum ipset_errno { IPSET_ERR_EXIST, IPSET_ERR_INVALID_CIDR, IPSET_ERR_INVALID_NETMASK, + IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_INVALID_FAMILY, IPSET_ERR_TIMEOUT, IPSET_ERR_REFERENCED, -- cgit v1.2.3 From af284ece87365f3a69723f5bcc1bcdb505b5eb5d Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 13 Feb 2014 12:19:56 +0100 Subject: netfilter: ipset: Prepare the kernel for create option flags when no extension is needed Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 2 ++ include/uapi/linux/netfilter/ipset/ip_set.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 4ac00d4aa87e..f476bcec25ea 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -219,6 +219,8 @@ struct ip_set { u8 revision; /* Extensions */ u8 extensions; + /* Create flags */ + u8 flags; /* Default timeout value, if enabled */ u32 timeout; /* Element data size */ diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index f636f282b142..a29a378701d2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -188,6 +188,12 @@ enum ipset_cadt_flags { IPSET_FLAG_CADT_MAX = 15, }; +/* The flag bits which correspond to the non-extension create flags */ +enum ipset_create_flags { + IPSET_CREATE_FLAG_NONE = 0, + IPSET_CREATE_FLAG_MAX = 7, +}; + /* Commands with settype-specific attributes */ enum ipset_adt { IPSET_ADD, -- cgit v1.2.3 From 004088768b78f69002f03a341597217eb608fb2c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Feb 2014 12:40:59 +0100 Subject: netfilter: ipset: kernel: uapi: fix MARKMASK attr ABI breakage commit 2dfb973c0dcc6d2211 (add markmask for hash:ip,mark data type) inserted IPSET_ATTR_MARKMASK in-between other enum values, i.e. changing values of all further attributes. This causes 'ipset list' segfault on existing kernels since ipset no longer finds IPSET_ATTR_MEMSIZE (it has a different value on kernel side). Jozsef points out it should be moved below IPSET_ATTR_MARK which works since there is some extra reserved space after that value. Signed-off-by: Florian Westphal Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index a29a378701d2..a1ca24408206 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -83,13 +83,13 @@ enum { IPSET_ATTR_CADT_FLAGS, /* 8 */ IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ + IPSET_ATTR_MARKMASK, /* 11 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ IPSET_ATTR_GC, IPSET_ATTR_HASHSIZE, IPSET_ATTR_MAXELEM, - IPSET_ATTR_MARKMASK, IPSET_ATTR_NETMASK, IPSET_ATTR_PROBES, IPSET_ATTR_RESIZE, @@ -139,7 +139,6 @@ enum ipset_errno { IPSET_ERR_EXIST, IPSET_ERR_INVALID_CIDR, IPSET_ERR_INVALID_NETMASK, - IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_INVALID_FAMILY, IPSET_ERR_TIMEOUT, IPSET_ERR_REFERENCED, @@ -147,6 +146,7 @@ enum ipset_errno { IPSET_ERR_IPADDR_IPV6, IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, + IPSET_ERR_INVALID_MARKMASK, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, -- cgit v1.2.3 From 07cf8f5ae2657ac495b906c68ff3441ff8ba80ba Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Fri, 28 Feb 2014 22:14:57 -0500 Subject: netfilter: ipset: add forceadd kernel support for hash set types Adds a new property for hash set types, where if a set is created with the 'forceadd' option and the set becomes full the next addition to the set may succeed and evict a random entry from the set. To keep overhead low eviction is done very simply. It checks to see which bucket the new entry would be added. If the bucket's pos value is non-zero (meaning there's at least one entry in the bucket) it replaces the first entry in the bucket. If pos is zero, then it continues down the normal add process. This property is useful if you have a set for 'ban' lists where it may not matter if you release some entries from the set early. Signed-off-by: Josh Hunt Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 3 +++ include/uapi/linux/netfilter/ipset/ip_set.h | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index f476bcec25ea..96afc29184be 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -65,6 +65,7 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { @@ -255,6 +256,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; if (!cadt_flags) return 0; diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index a1ca24408206..78c2f2e79920 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -185,13 +185,16 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COUNTERS = (1 << IPSET_FLAG_BIT_WITH_COUNTERS), IPSET_FLAG_BIT_WITH_COMMENT = 4, IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), + IPSET_FLAG_BIT_WITH_FORCEADD = 5, + IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), IPSET_FLAG_CADT_MAX = 15, }; /* The flag bits which correspond to the non-extension create flags */ enum ipset_create_flags { - IPSET_CREATE_FLAG_NONE = 0, - IPSET_CREATE_FLAG_MAX = 7, + IPSET_CREATE_FLAG_BIT_FORCEADD = 0, + IPSET_CREATE_FLAG_FORCEADD = (1 << IPSET_CREATE_FLAG_BIT_FORCEADD), + IPSET_CREATE_FLAG_BIT_MAX = 7, }; /* Commands with settype-specific attributes */ -- cgit v1.2.3 From b476b72a0f8514a5a4c561bab731ddd506a284e7 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Mar 2014 14:44:54 +0100 Subject: netfilter: trivial code cleanup and doc changes Changes while reading through the netfilter code. Added hint about how conntrack nf_conn refcnt is accessed. And renamed repl_hash to reply_hash for readability Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index b2ac6246b7e0..e10d1faa6d09 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -73,7 +73,13 @@ struct nf_conn_help { struct nf_conn { /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, - plus 1 for any connection(s) we are `master' for */ + * plus 1 for any connection(s) we are `master' for + * + * Hint, SKB address this struct and refcnt via skb->nfct and + * helpers nf_conntrack_get() and nf_conntrack_put(). + * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, + * beware nf_ct_get() is different and don't inc refcnt. + */ struct nf_conntrack ct_general; spinlock_t lock; -- cgit v1.2.3 From b7779d06f9950e14a008a2de970b44233fe49c86 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Mar 2014 14:45:20 +0100 Subject: netfilter: conntrack: spinlock per cpu to protect special lists. One spinlock per cpu to protect dying/unconfirmed/template special lists. (These lists are now per cpu, a bit like the untracked ct) Add a @cpu field to nf_conn, to make sure we hold the appropriate spinlock at removal time. Signed-off-by: Eric Dumazet Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 ++- include/net/netns/conntrack.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e10d1faa6d09..37252f71a380 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -82,7 +82,8 @@ struct nf_conn { */ struct nf_conntrack ct_general; - spinlock_t lock; + spinlock_t lock; + u16 cpu; /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index fbcc7fa536dc..c6a8994e9922 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -62,6 +62,13 @@ struct nf_ip_net { #endif }; +struct ct_pcpu { + spinlock_t lock; + struct hlist_nulls_head unconfirmed; + struct hlist_nulls_head dying; + struct hlist_nulls_head tmpl; +}; + struct netns_ct { atomic_t count; unsigned int expect_count; @@ -86,9 +93,7 @@ struct netns_ct { struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; - struct hlist_nulls_head unconfirmed; - struct hlist_nulls_head dying; - struct hlist_nulls_head tmpl; + struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; struct nf_exp_event_notifier __rcu *nf_expect_event_cb; -- cgit v1.2.3 From ca7433df3a672efc88e08222cfa4b3aa965ca324 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Mar 2014 14:46:01 +0100 Subject: netfilter: conntrack: seperate expect locking from nf_conntrack_lock Netfilter expectations are protected with the same lock as conntrack entries (nf_conntrack_lock). This patch split out expectations locking to use it's own lock (nf_conntrack_expect_lock). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 15308b8eb5b5..d12a631d0415 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -79,4 +79,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, extern spinlock_t nf_conntrack_lock ; +extern spinlock_t nf_conntrack_expect_lock; + #endif /* _NF_CONNTRACK_CORE_H */ -- cgit v1.2.3 From 93bb0ceb75be2fdfa9fc0dd1fb522d9ada515d9c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Mar 2014 14:46:13 +0100 Subject: netfilter: conntrack: remove central spinlock nf_conntrack_lock nf_conntrack_lock is a monolithic lock and suffers from huge contention on current generation servers (8 or more core/threads). Perf locking congestion is clear on base kernel: - 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh - _raw_spin_lock_bh + 25.33% init_conntrack + 24.86% nf_ct_delete_from_lists + 24.62% __nf_conntrack_confirm + 24.38% destroy_conntrack + 0.70% tcp_packet + 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup + 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free + 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer + 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete + 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table This patch change conntrack locking and provides a huge performance improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with 10Gbit/s ixgbe (with tool trafgen): Base kernel: 810.405 new conntrack/sec After patch: 2.233.876 new conntrack/sec Notice other floods attack (SYN+ACK or ACK) can easily be deflected using: # iptables -A INPUT -m state --state INVALID -j DROP # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0 Use an array of hashed spinlocks to protect insertions/deletions of conntracks into the hash table. 1024 spinlocks seem to give good results, at minimal cost (4KB memory). Due to lockdep max depth, 1024 becomes 8 if CONFIG_LOCKDEP=y The hash resize is a bit tricky, because we need to take all locks in the array. A seqcount_t is used to synchronize the hash table users with the resizing process. Signed-off-by: Eric Dumazet Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 7 ++++++- include/net/netns/conntrack.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index d12a631d0415..cc0c18827602 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *proto); -extern spinlock_t nf_conntrack_lock ; +#ifdef CONFIG_LOCKDEP +# define CONNTRACK_LOCKS 8 +#else +# define CONNTRACK_LOCKS 1024 +#endif +extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; extern spinlock_t nf_conntrack_expect_lock; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index c6a8994e9922..773cce308bc6 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -5,6 +5,7 @@ #include #include #include +#include struct ctl_table_header; struct nf_conntrack_ecache; @@ -90,6 +91,7 @@ struct netns_ct { int sysctl_checksum; unsigned int htable_size; + seqcount_t generation; struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; -- cgit v1.2.3 From 62472bcefb56ae9c3a6be3284949ce758656cdec Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 7 Mar 2014 19:08:30 +0100 Subject: netfilter: nf_tables: restore context for expression destructors In order to fix set destruction notifications and get rid of unnecessary members in private data structures, pass the context to expressions' destructor functions again. In order to do so, replace various members in the nft_rule_trans structure by the full context. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5af56da6d6c6..e6bc14d8fa9a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -289,7 +289,8 @@ struct nft_expr_ops { int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); - void (*destroy)(const struct nft_expr *expr); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr); int (*validate)(const struct nft_ctx *ctx, @@ -343,19 +344,13 @@ struct nft_rule { * struct nft_rule_trans - nf_tables rule update in transaction * * @list: used internally + * @ctx: rule context * @rule: rule that needs to be updated - * @chain: chain that this rule belongs to - * @table: table for which this chain applies - * @nlh: netlink header of the message that contain this update - * @family: family expressesed as AF_* */ struct nft_rule_trans { struct list_head list; + struct nft_ctx ctx; struct nft_rule *rule; - const struct nft_chain *chain; - const struct nft_table *table; - const struct nlmsghdr *nlh; - u8 family; }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) -- cgit v1.2.3