From 7cffcade57a429667447c4f41d8414bbcf1b3aaa Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Tue, 13 Dec 2022 23:46:52 +0800 Subject: xen: make remove callback of xen driver void returned Since commit fc7a6209d571 ("bus: Make remove callback return void") forces bus_type::remove be void-returned, it doesn't make much sense for any bus based driver implementing remove callbalk to return non-void to its caller. This change is for xen bus based drivers. Acked-by: Juergen Gross Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/TYCP286MB23238119AB4DF190997075C9CAE39@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Juergen Gross --- net/9p/trans_xen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index aaa5fd364691..9950e1a5acb7 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -303,13 +303,12 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) kfree(priv); } -static int xen_9pfs_front_remove(struct xenbus_device *dev) +static void xen_9pfs_front_remove(struct xenbus_device *dev) { struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); dev_set_drvdata(&dev->dev, NULL); xen_9pfs_front_free(priv); - return 0; } static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, -- cgit v1.2.3 From b18cba09e374637a0a3759d856a6bca94c133952 Mon Sep 17 00:00:00 2001 From: minoura makoto Date: Tue, 13 Dec 2022 13:14:31 +0900 Subject: SUNRPC: ensure the matching upcall is in-flight upon downcall Commit 9130b8dbc6ac ("SUNRPC: allow for upcalls for the same uid but different gss service") introduced `auth` argument to __gss_find_upcall(), but in gss_pipe_downcall() it was left as NULL since it (and auth->service) was not (yet) determined. When multiple upcalls with the same uid and different service are ongoing, it could happen that __gss_find_upcall(), which returns the first match found in the pipe->in_downcall list, could not find the correct gss_msg corresponding to the downcall we are looking for. Moreover, it might return a msg which is not sent to rpc.gssd yet. We could see mount.nfs process hung in D state with multiple mount.nfs are executed in parallel. The call trace below is of CentOS 7.9 kernel-3.10.0-1160.24.1.el7.x86_64 but we observed the same hang w/ elrepo kernel-ml-6.0.7-1.el7. PID: 71258 TASK: ffff91ebd4be0000 CPU: 36 COMMAND: "mount.nfs" #0 [ffff9203ca3234f8] __schedule at ffffffffa3b8899f #1 [ffff9203ca323580] schedule at ffffffffa3b88eb9 #2 [ffff9203ca323590] gss_cred_init at ffffffffc0355818 [auth_rpcgss] #3 [ffff9203ca323658] rpcauth_lookup_credcache at ffffffffc0421ebc [sunrpc] #4 [ffff9203ca3236d8] gss_lookup_cred at ffffffffc0353633 [auth_rpcgss] #5 [ffff9203ca3236e8] rpcauth_lookupcred at ffffffffc0421581 [sunrpc] #6 [ffff9203ca323740] rpcauth_refreshcred at ffffffffc04223d3 [sunrpc] #7 [ffff9203ca3237a0] call_refresh at ffffffffc04103dc [sunrpc] #8 [ffff9203ca3237b8] __rpc_execute at ffffffffc041e1c9 [sunrpc] #9 [ffff9203ca323820] rpc_execute at ffffffffc0420a48 [sunrpc] The scenario is like this. Let's say there are two upcalls for services A and B, A -> B in pipe->in_downcall, B -> A in pipe->pipe. When rpc.gssd reads pipe to get the upcall msg corresponding to service B from pipe->pipe and then writes the response, in gss_pipe_downcall the msg corresponding to service A will be picked because only uid is used to find the msg and it is before the one for B in pipe->in_downcall. And the process waiting for the msg corresponding to service A will be woken up. Actual scheduing of that process might be after rpc.gssd processes the next msg. In rpc_pipe_generic_upcall it clears msg->errno (for A). The process is scheduled to see gss_msg->ctx == NULL and gss_msg->msg.errno == 0, therefore it cannot break the loop in gss_create_upcall and is never woken up after that. This patch adds a simple check to ensure that a msg which is not sent to rpc.gssd yet is not chosen as the matching upcall upon receiving a downcall. Signed-off-by: minoura makoto Signed-off-by: Hiroshi Shimamoto Tested-by: Hiroshi Shimamoto Cc: Trond Myklebust Fixes: 9130b8dbc6ac ("SUNRPC: allow for upcalls for same uid but different gss service") Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7bb247c51e2f..2d7b1e03110a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -302,7 +302,7 @@ __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; - if (auth && pos->auth->service != auth->service) + if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; @@ -686,6 +686,21 @@ out: return err; } +static struct gss_upcall_msg * +gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &pipe->in_downcall, list) { + if (!uid_eq(pos->uid, uid)) + continue; + if (!rpc_msg_is_inflight(&pos->msg)) + continue; + refcount_inc(&pos->count); + return pos; + } + return NULL; +} + #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -732,7 +747,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); - gss_msg = __gss_find_upcall(pipe, uid, NULL); + gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; -- cgit v1.2.3 From 54c3f1a81421f85e60ae2eaae7be3727a09916ee Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Dec 2022 16:47:00 -0800 Subject: bpf: pull before calling skb_postpull_rcsum() Anand hit a BUG() when pulling off headers on egress to a SW tunnel. We get to skb_checksum_help() with an invalid checksum offset (commit d7ea0d9df2a6 ("net: remove two BUG() from skb_checksum_help()") converted those BUGs to WARN_ONs()). He points out oddness in how skb_postpull_rcsum() gets used. Indeed looks like we should pull before "postpull", otherwise the CHECKSUM_PARTIAL fixup from skb_postpull_rcsum() will not be able to do its job: if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; Reported-by: Anand Parthasarathy Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Signed-off-by: Jakub Kicinski Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221220004701.402165-1-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 929358677183..43cc1fe58a2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3180,15 +3180,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { + void *old_data; + /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; - skb_postpull_rcsum(skb, skb->data + off, len); - memmove(skb->data + len, skb->data, off); + old_data = skb->data; __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + memmove(skb->data, old_data, off); return 0; } -- cgit v1.2.3 From 5eb119da94ac5d67a31eaa869621dc6e25eb125e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 15 Dec 2022 15:16:33 +0100 Subject: netfilter: conntrack: fix ipv6 exthdr error check smatch warnings: net/netfilter/nf_conntrack_proto.c:167 nf_confirm() warn: unsigned 'protoff' is never less than zero. We need to check if ipv6_skip_exthdr() returned an error, but protoff is unsigned. Use a signed integer for this. Fixes: a70e483460d5 ("netfilter: conntrack: merge ipv4+ipv6 confirm functions") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 99323fb12d0f..ccef340be575 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -141,6 +141,7 @@ unsigned int nf_confirm(void *priv, struct nf_conn *ct; bool seqadj_needed; __be16 frag_off; + int start; u8 pnum; ct = nf_ct_get(skb, &ctinfo); @@ -163,9 +164,11 @@ unsigned int nf_confirm(void *priv, break; case NFPROTO_IPV6: pnum = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (start < 0 || (frag_off & htons(~0x7)) != 0) return nf_conntrack_confirm(skb); + + protoff = start; break; default: return nf_conntrack_confirm(skb); -- cgit v1.2.3 From bed4a63ea4ae77cfe5aae004ef87379f0655260a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:07:52 +0100 Subject: netfilter: nf_tables: consolidate set description Add the following fields to the set description: - key type - data type - object type - policy - gc_int: garbage collection interval) - timeout: element timeout This prepares for stricter set type checks on updates in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 58 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 832b881f7c17..1deecc1a6c00 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3780,8 +3780,7 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) static const struct nft_set_ops * nft_select_set_ops(const struct nft_ctx *ctx, const struct nlattr * const nla[], - const struct nft_set_desc *desc, - enum nft_set_policies policy) + const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; @@ -3810,7 +3809,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!ops->estimate(desc, flags, &est)) continue; - switch (policy) { + switch (desc->policy) { case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; @@ -4392,7 +4391,6 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; @@ -4405,10 +4403,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; - u64 timeout; char *name; int err, i; u16 udlen; + u32 flags; u64 size; if (nla[NFTA_SET_TABLE] == NULL || @@ -4419,10 +4417,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, memset(&desc, 0, sizeof(desc)); - ktype = NFT_DATA_VALUE; + desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { - ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); - if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) + desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); + if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } @@ -4447,17 +4445,17 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; } - dtype = 0; + desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; - dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); - if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && - dtype != NFT_DATA_VERDICT) + desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); + if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && + desc.dtype != NFT_DATA_VERDICT) return -EINVAL; - if (dtype != NFT_DATA_VERDICT) { + if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); @@ -4472,34 +4470,34 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_OBJECT)) return -EINVAL; - objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); - if (objtype == NFT_OBJECT_UNSPEC || - objtype > NFT_OBJECT_MAX) + desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (desc.objtype == NFT_OBJECT_UNSPEC || + desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else - objtype = NFT_OBJECT_UNSPEC; + desc.objtype = NFT_OBJECT_UNSPEC; - timeout = 0; + desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } - gc_int = 0; + desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } - policy = NFT_SET_POL_PERFORMANCE; + desc.policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) - policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); @@ -4544,7 +4542,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc, policy); + ops = nft_select_set_ops(&ctx, nla, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -4584,18 +4582,18 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, set->table = table; write_pnet(&set->net, net); set->ops = ops; - set->ktype = ktype; + set->ktype = desc.ktype; set->klen = desc.klen; - set->dtype = dtype; - set->objtype = objtype; + set->dtype = desc.dtype; + set->objtype = desc.objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; - set->policy = policy; + set->policy = desc.policy; set->udlen = udlen; set->udata = udata; - set->timeout = timeout; - set->gc_int = gc_int; + set->timeout = desc.timeout; + set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) -- cgit v1.2.3 From a8fe4154fa5a1bae590b243ed60f871e5a5e1378 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 18:00:10 +0100 Subject: netfilter: nf_tables: add function to create set stateful expressions Add a helper function to allocate and initialize the stateful expressions that are defined in a set. This patch allows to reuse this code from the set update path, to check that type of the update matches the existing set in the kernel. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 106 +++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1deecc1a6c00..b9b0ae29f5f6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4388,6 +4388,59 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } +static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr * const *nla, + struct nft_expr **exprs, int *num_exprs, + u32 flags) +{ + struct nft_expr *expr; + int err, i; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[0] = expr; + (*num_exprs)++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nlattr *tmp; + int left; + + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_expr_alloc; + } + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_expr_alloc; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_expr_alloc; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[i++] = expr; + (*num_exprs)++; + } + } + + return 0; + +err_set_expr_alloc: + for (i = 0; i < *num_exprs; i++) + nft_expr_destroy(ctx, exprs[i]); + + return err; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -4395,7 +4448,6 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_expr *expr = NULL; struct net *net = info->net; struct nft_set_desc desc; struct nft_table *table; @@ -4403,6 +4455,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; + int num_exprs = 0; char *name; int err, i; u16 udlen; @@ -4529,6 +4582,8 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(set); } } else { + struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {}; + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; @@ -4536,6 +4591,13 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); + if (err < 0) + return err; + + for (i = 0; i < num_exprs; i++) + nft_expr_destroy(&ctx, exprs[i]); + return 0; } @@ -4603,43 +4665,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) goto err_set_init; - if (nla[NFTA_SET_EXPR]) { - expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[0] = expr; - set->num_exprs++; - } else if (nla[NFTA_SET_EXPRESSIONS]) { - struct nft_expr *expr; - struct nlattr *tmp; - int left; - - if (!(flags & NFT_SET_EXPR)) { - err = -EINVAL; - goto err_set_expr_alloc; - } - i = 0; - nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { - if (i == NFT_SET_EXPR_MAX) { - err = -E2BIG; - goto err_set_expr_alloc; - } - if (nla_type(tmp) != NFTA_LIST_ELEM) { - err = -EINVAL; - goto err_set_expr_alloc; - } - expr = nft_set_elem_expr_alloc(&ctx, set, tmp); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[i++] = expr; - set->num_exprs++; - } - } + err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); + if (err < 0) + goto err_set_destroy; + set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); @@ -4653,7 +4683,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); - +err_set_destroy: ops->destroy(set); err_set_init: kfree(set->name); -- cgit v1.2.3 From f6594c372afd5cec8b1e9ee9ea8f8819d59c6fb1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:09:00 +0100 Subject: netfilter: nf_tables: perform type checking for existing sets If a ruleset declares a set name that matches an existing set in the kernel, then validate that this declaration really refers to the same set, otherwise bail out with EEXIST. Currently, the kernel reports success when adding a set that already exists in the kernel. This usually results in EINVAL errors at a later stage, when the user adds elements to the set, if the set declaration mismatches the existing set representation in the kernel. Add a new function to check that the set declaration really refers to the same existing set in the kernel. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b9b0ae29f5f6..319887f4d3ef 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4441,6 +4441,34 @@ err_set_expr_alloc: return err; } +static bool nft_set_is_same(const struct nft_set *set, + const struct nft_set_desc *desc, + struct nft_expr *exprs[], u32 num_exprs, u32 flags) +{ + int i; + + if (set->ktype != desc->ktype || + set->dtype != desc->dtype || + set->flags != flags || + set->klen != desc->klen || + set->dlen != desc->dlen || + set->field_count != desc->field_count || + set->num_exprs != num_exprs) + return false; + + for (i = 0; i < desc->field_count; i++) { + if (set->field_len[i] != desc->field_len[i]) + return false; + } + + for (i = 0; i < num_exprs; i++) { + if (set->exprs[i]->ops != exprs[i]->ops) + return false; + } + + return true; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -4595,10 +4623,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; + err = 0; + if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); + err = -EEXIST; + } + for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); - return 0; + return err; } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) -- cgit v1.2.3 From 95637d91fefdb94d6e7389222ba9ddab0e9f5abe Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Tue, 20 Dec 2022 16:27:17 -0500 Subject: net: openvswitch: release vport resources on failure A recent commit introducing upcall packet accounting failed to properly release the vport object when the per-cpu stats struct couldn't be allocated. This can cause dangling pointers to dp objects long after they've been released. Cc: wangchuanlei Fixes: 1933ea365aa7 ("net: openvswitch: Add support to count upcall packets") Reported-by: syzbot+8f4e2dcfcb3209ac35f9@syzkaller.appspotmail.com Signed-off-by: Aaron Conole Acked-by: Eelco Chaudron Reviewed-by: Michal Swiatkowski Link: https://lore.kernel.org/r/20221220212717.526780-1-aconole@redhat.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9ca721c9fa71..a71795355aec 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1861,7 +1861,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; - goto err_destroy_portids; + goto err_destroy_vport; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1876,6 +1876,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; +err_destroy_vport: + ovs_dp_detach_port(vport); err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: @@ -2323,7 +2325,7 @@ restart: vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; - goto exit_unlock_free; + goto exit_unlock_free_vport; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), @@ -2343,6 +2345,8 @@ restart: ovs_notify(&dp_vport_genl_family, reply, info); return 0; +exit_unlock_free_vport: + ovs_dp_detach_port(vport); exit_unlock_free: ovs_unlock(); kfree_skb(reply); -- cgit v1.2.3 From 7d803344fdc3e38079fabcf38b1e4cb6f8faa655 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Dec 2022 11:52:14 -0800 Subject: mptcp: fix deadlock in fastopen error path MatM reported a deadlock at fastopening time: INFO: task syz-executor.0:11454 blocked for more than 143 seconds. Tainted: G S 6.1.0-rc5-03226-gdb0157db5153 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.0 state:D stack:25104 pid:11454 ppid:424 flags:0x00004006 Call Trace: context_switch kernel/sched/core.c:5191 [inline] __schedule+0x5c2/0x1550 kernel/sched/core.c:6503 schedule+0xe8/0x1c0 kernel/sched/core.c:6579 __lock_sock+0x142/0x260 net/core/sock.c:2896 lock_sock_nested+0xdb/0x100 net/core/sock.c:3466 __mptcp_close_ssk+0x1a3/0x790 net/mptcp/protocol.c:2328 mptcp_destroy_common+0x16a/0x650 net/mptcp/protocol.c:3171 mptcp_disconnect+0xb8/0x450 net/mptcp/protocol.c:3019 __inet_stream_connect+0x897/0xa40 net/ipv4/af_inet.c:720 tcp_sendmsg_fastopen+0x3dd/0x740 net/ipv4/tcp.c:1200 mptcp_sendmsg_fastopen net/mptcp/protocol.c:1682 [inline] mptcp_sendmsg+0x128a/0x1a50 net/mptcp/protocol.c:1721 inet6_sendmsg+0x11f/0x150 net/ipv6/af_inet6.c:663 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xf7/0x190 net/socket.c:734 ____sys_sendmsg+0x336/0x970 net/socket.c:2476 ___sys_sendmsg+0x122/0x1c0 net/socket.c:2530 __sys_sendmmsg+0x18d/0x460 net/socket.c:2616 __do_sys_sendmmsg net/socket.c:2645 [inline] __se_sys_sendmmsg net/socket.c:2642 [inline] __x64_sys_sendmmsg+0x9d/0x110 net/socket.c:2642 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5920a75e7d RSP: 002b:00007f59201e8028 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00007f5920bb4f80 RCX: 00007f5920a75e7d RDX: 0000000000000001 RSI: 0000000020002940 RDI: 0000000000000005 RBP: 00007f5920ae7593 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000020004050 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f5920bb4f80 R15: 00007f59201c8000 In the error path, tcp_sendmsg_fastopen() ends-up calling mptcp_disconnect(), and the latter tries to close each subflow, acquiring the socket lock on each of them. At fastopen time, we have a single subflow, and such subflow socket lock is already held by the called, causing the deadlock. We already track the 'fastopen in progress' status inside the msk socket. Use it to address the issue, making mptcp_disconnect() a no op when invoked from the fastopen (error) path and doing the relevant cleanup after releasing the subflow socket lock. While at the above, rename the fastopen status bit to something more meaningful. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/321 Fixes: fa9e57468aa1 ("mptcp: fix abba deadlock on fastopen") Reported-by: Mat Martineau Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++++++++--- net/mptcp/protocol.h | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f6f93957275b..907b435e2984 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1662,6 +1662,8 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_disconnect(struct sock *sk, int flags); + static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, size_t len, int *copied_syn) { @@ -1672,9 +1674,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->connect_flags = O_NONBLOCK; - msk->is_sendmsg = 1; + msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); - msk->is_sendmsg = 0; + msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); @@ -1688,6 +1690,8 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; + } else if (ret && ret != -EINPROGRESS) { + mptcp_disconnect(sk, 0); } return ret; @@ -2989,6 +2993,14 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* We are on the fastopen error path. We can't call straight into the + * subflows cleanup code due to lock nesting (we are already under + * msk->firstsocket lock). Do nothing and leave the cleanup to the + * caller. + */ + if (msk->fastopening) + return 0; + inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3532,7 +3544,7 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ - if (msk->is_sendmsg) + if (msk->fastopening) err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); else err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 955fb3d88eb3..f47d3e4018b5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -295,7 +295,7 @@ struct mptcp_sock { u8 recvmsg_inq:1, cork:1, nodelay:1, - is_sendmsg:1; + fastopening:1; int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; -- cgit v1.2.3 From fec3adfd754ccc99a7230e8ab9f105b65fb07bcc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Dec 2022 11:52:15 -0800 Subject: mptcp: fix lockdep false positive MattB reported a lockdep splat in the mptcp listener code cleanup: WARNING: possible circular locking dependency detected packetdrill/14278 is trying to acquire lock: ffff888017d868f0 ((work_completion)(&msk->work)){+.+.}-{0:0}, at: __flush_work (kernel/workqueue.c:3069) but task is already holding lock: ffff888017d84130 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close (net/mptcp/protocol.c:2973) which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+.}-{0:0}: __lock_acquire (kernel/locking/lockdep.c:5055) lock_acquire (kernel/locking/lockdep.c:466) lock_sock_nested (net/core/sock.c:3463) mptcp_worker (net/mptcp/protocol.c:2614) process_one_work (kernel/workqueue.c:2294) worker_thread (include/linux/list.h:292) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:312) -> #0 ((work_completion)(&msk->work)){+.+.}-{0:0}: check_prev_add (kernel/locking/lockdep.c:3098) validate_chain (kernel/locking/lockdep.c:3217) __lock_acquire (kernel/locking/lockdep.c:5055) lock_acquire (kernel/locking/lockdep.c:466) __flush_work (kernel/workqueue.c:3070) __cancel_work_timer (kernel/workqueue.c:3160) mptcp_cancel_work (net/mptcp/protocol.c:2758) mptcp_subflow_queue_clean (net/mptcp/subflow.c:1817) __mptcp_close_ssk (net/mptcp/protocol.c:2363) mptcp_destroy_common (net/mptcp/protocol.c:3170) mptcp_destroy (include/net/sock.h:1495) __mptcp_destroy_sock (net/mptcp/protocol.c:2886) __mptcp_close (net/mptcp/protocol.c:2959) mptcp_close (net/mptcp/protocol.c:2974) inet_release (net/ipv4/af_inet.c:432) __sock_release (net/socket.c:651) sock_close (net/socket.c:1367) __fput (fs/file_table.c:320) task_work_run (kernel/task_work.c:181 (discriminator 1)) exit_to_user_mode_prepare (include/linux/resume_user_mode.h:49) syscall_exit_to_user_mode (kernel/entry/common.c:130) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock((work_completion)(&msk->work)); lock(sk_lock-AF_INET); lock((work_completion)(&msk->work)); *** DEADLOCK *** The report is actually a false positive, since the only existing lock nesting is the msk socket lock acquired by the mptcp work. cancel_work_sync() is invoked without the relevant socket lock being held, but under a different (the msk listener) socket lock. We could silence the splat adding a per workqueue dynamic lockdep key, but that looks overkill. Instead just tell lockdep the msk socket lock is not held around cancel_work_sync(). Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/322 Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue") Reported-by: Matthieu Baerts Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 19 +++++++++++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 907b435e2984..b7ad030dfe89 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2357,7 +2357,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(ssk); + mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f47d3e4018b5..a0d1658ce59e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -628,7 +628,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); -void mptcp_subflow_queue_clean(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d1d32a66ae3f..bd387d4b5a38 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1791,7 +1791,7 @@ static void subflow_state_change(struct sock *sk) } } -void mptcp_subflow_queue_clean(struct sock *listener_ssk) +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; @@ -1840,8 +1840,23 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk) do_cancel_work = __mptcp_close(sk, 0); release_sock(sk); - if (do_cancel_work) + if (do_cancel_work) { + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, + SINGLE_DEPTH_NESTING, 0, _RET_IP_); + } sock_put(sk); } -- cgit v1.2.3 From 123b99619cca94bdca0bf7bde9abe28f0a0dfe06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:10:12 +0100 Subject: netfilter: nf_tables: honor set timeout and garbage collection updates Set timeout and garbage collection interval updates are ignored on updates. Add transaction to update global set element timeout and garbage collection interval. Fixes: 96518518cc41 ("netfilter: add nftables") Suggested-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 63 ++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 319887f4d3ef..8c09e4d12ac1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -465,8 +465,9 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, - struct nft_set *set) +static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set, + const struct nft_set_desc *desc) { struct nft_trans *trans; @@ -474,17 +475,28 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, if (trans == NULL) return -ENOMEM; - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; + if (desc) { + nft_trans_set_update(trans) = true; + nft_trans_set_gc_int(trans) = desc->gc_int; + nft_trans_set_timeout(trans) = desc->timeout; + } nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + return __nft_trans_set_add(ctx, msg_type, set, NULL); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -4044,8 +4056,10 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nlmsghdr *nlh; + u64 timeout = READ_ONCE(set->timeout); + u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; + struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; int i; @@ -4081,13 +4095,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; - if (set->timeout && + if (timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, - nf_jiffies64_to_msecs(set->timeout), + nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; - if (set->gc_int && - nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + if (gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { @@ -4632,7 +4646,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); - return err; + if (err < 0) + return err; + + return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -6070,7 +6087,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { - timeout = set->timeout; + timeout = READ_ONCE(set->timeout); } expiration = 0; @@ -6171,7 +6188,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_parse_key_end; - if (timeout != set->timeout) { + if (timeout != READ_ONCE(set->timeout)) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; @@ -9093,14 +9110,20 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: - nft_clear(net, nft_trans_set(trans)); - /* This avoids hitting -EBUSY when deleting the table - * from the transaction. - */ - if (nft_set_is_anonymous(nft_trans_set(trans)) && - !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + if (nft_trans_set_update(trans)) { + struct nft_set *set = nft_trans_set(trans); + WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); + WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); + } else { + nft_clear(net, nft_trans_set(trans)); + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_set_is_anonymous(nft_trans_set(trans)) && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + } nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); @@ -9322,6 +9345,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: + if (nft_trans_set_update(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); -- cgit v1.2.3 From 399ab7fe0fa0d846881685fd4e57e9a8ef7559f7 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 22 Dec 2022 11:51:19 +0800 Subject: net: sched: fix memory leak in tcindex_set_parms Syzkaller reports a memory leak as follows: ==================================== BUG: memory leak unreferenced object 0xffff88810c287f00 (size 256): comm "syz-executor105", pid 3600, jiffies 4294943292 (age 12.990s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmalloc_trace+0x20/0x90 mm/slab_common.c:1046 [] kmalloc include/linux/slab.h:576 [inline] [] kmalloc_array include/linux/slab.h:627 [inline] [] kcalloc include/linux/slab.h:659 [inline] [] tcf_exts_init include/net/pkt_cls.h:250 [inline] [] tcindex_set_parms+0xa7/0xbe0 net/sched/cls_tcindex.c:342 [] tcindex_change+0xdf/0x120 net/sched/cls_tcindex.c:553 [] tc_new_tfilter+0x4f2/0x1100 net/sched/cls_api.c:2147 [] rtnetlink_rcv_msg+0x4dc/0x5d0 net/core/rtnetlink.c:6082 [] netlink_rcv_skb+0x87/0x1d0 net/netlink/af_netlink.c:2540 [] netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] [] netlink_unicast+0x397/0x4c0 net/netlink/af_netlink.c:1345 [] netlink_sendmsg+0x396/0x710 net/netlink/af_netlink.c:1921 [] sock_sendmsg_nosec net/socket.c:714 [inline] [] sock_sendmsg+0x56/0x80 net/socket.c:734 [] ____sys_sendmsg+0x178/0x410 net/socket.c:2482 [] ___sys_sendmsg+0xa8/0x110 net/socket.c:2536 [] __sys_sendmmsg+0x105/0x330 net/socket.c:2622 [] __do_sys_sendmmsg net/socket.c:2651 [inline] [] __se_sys_sendmmsg net/socket.c:2648 [inline] [] __x64_sys_sendmmsg+0x24/0x30 net/socket.c:2648 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd ==================================== Kernel uses tcindex_change() to change an existing filter properties. Yet the problem is that, during the process of changing, if `old_r` is retrieved from `p->perfect`, then kernel uses tcindex_alloc_perfect_hash() to newly allocate filter results, uses tcindex_filter_result_init() to clear the old filter result, without destroying its tcf_exts structure, which triggers the above memory leak. To be more specific, there are only two source for the `old_r`, according to the tcindex_lookup(). `old_r` is retrieved from `p->perfect`, or `old_r` is retrieved from `p->h`. * If `old_r` is retrieved from `p->perfect`, kernel uses tcindex_alloc_perfect_hash() to newly allocate the filter results. Then `r` is assigned with `cp->perfect + handle`, which is newly allocated. So condition `old_r && old_r != r` is true in this situation, and kernel uses tcindex_filter_result_init() to clear the old filter result, without destroying its tcf_exts structure * If `old_r` is retrieved from `p->h`, then `p->perfect` is NULL according to the tcindex_lookup(). Considering that `cp->h` is directly copied from `p->h` and `p->perfect` is NULL, `r` is assigned with `tcindex_lookup(cp, handle)`, whose value should be the same as `old_r`, so condition `old_r && old_r != r` is false in this situation, kernel ignores using tcindex_filter_result_init() to clear the old filter result. So only when `old_r` is retrieved from `p->perfect` does kernel use tcindex_filter_result_init() to clear the old filter result, which triggers the above memory leak. Considering that there already exists a tc_filter_wq workqueue to destroy the old tcindex_data by tcindex_partial_destroy_work() at the end of tcindex_set_parms(), this patch solves this memory leak bug by removing this old filter result clearing part and delegating it to the tc_filter_wq workqueue. Note that this patch doesn't introduce any other issues. If `old_r` is retrieved from `p->perfect`, this patch just delegates old filter result clearing part to the tc_filter_wq workqueue; If `old_r` is retrieved from `p->h`, kernel doesn't reach the old filter result clearing part, so removing this part has no effect. [Thanks to the suggestion from Jakub Kicinski, Cong Wang, Paolo Abeni and Dmitry Vyukov] Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Link: https://lore.kernel.org/all/0000000000001de5c505ebc9ec59@google.com/ Reported-by: syzbot+232ebdbd36706c965ebf@syzkaller.appspotmail.com Tested-by: syzbot+232ebdbd36706c965ebf@syzkaller.appspotmail.com Cc: Cong Wang Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Dmitry Vyukov Acked-by: Paolo Abeni Signed-off-by: Hawkins Jiawei Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index eb0e9458e722..ee2a050c887b 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -333,7 +333,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter_result *r, struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { - struct tcindex_filter_result new_filter_result, *old_r = r; + struct tcindex_filter_result new_filter_result; struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_result cr = {}; @@ -402,7 +402,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; - if (old_r) + if (r) cr = r->res; err = -EBUSY; @@ -479,14 +479,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr, base); } - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - oldp = p; r->res = cr; tcf_exts_change(&r->exts, &e); -- cgit v1.2.3 From df49908f3c52d211aea5e2a14a93bbe67a2cb3af Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 23 Dec 2022 11:37:18 +0400 Subject: nfc: Fix potential resource leaks nfc_get_device() take reference for the device, add missing nfc_put_device() to release it when not need anymore. Also fix the style warnning by use error EOPNOTSUPP instead of ENOTSUPP. Fixes: 5ce3f32b5264 ("NFC: netlink: SE API implementation") Fixes: 29e76924cf08 ("nfc: netlink: Add capability to reply to vendor_cmd with data") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- net/nfc/netlink.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 9d91087b9399..1fc339084d89 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1497,6 +1497,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; + int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || @@ -1510,25 +1511,37 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - if (!dev->ops || !dev->ops->se_io) - return -ENOTSUPP; + if (!dev->ops || !dev->ops->se_io) { + rc = -EOPNOTSUPP; + goto put_dev; + } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); - if (apdu_len == 0) - return -EINVAL; + if (apdu_len == 0) { + rc = -EINVAL; + goto put_dev; + } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - if (!apdu) - return -EINVAL; + if (!apdu) { + rc = -EINVAL; + goto put_dev; + } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + rc = -ENOMEM; + goto put_dev; + } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; - return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + +put_dev: + nfc_put_device(dev); + return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, @@ -1551,14 +1564,21 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); - if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + if (!dev) return -ENODEV; + if (!dev->vendor_cmds || !dev->n_vendor_cmds) { + err = -ENODEV; + goto put_dev; + } + if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); - if (data_len == 0) - return -EINVAL; + if (data_len == 0) { + err = -EINVAL; + goto put_dev; + } } else { data = NULL; data_len = 0; @@ -1573,10 +1593,14 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; - return err; + goto put_dev; } - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + +put_dev: + nfc_put_device(dev); + return err; } /* message building helper */ -- cgit v1.2.3 From 0e50d999903c009b6a9cd2277c82d6798d982e31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Dec 2022 14:49:00 +0000 Subject: rxrpc: Fix a couple of potential use-after-frees At the end of rxrpc_recvmsg(), if a call is found, the call is put and then a trace line is emitted referencing that call in a couple of places - but the call may have been deallocated by the time those traces happen. Fix this by stashing the call debug_id in a variable and passing that to the tracepoint rather than the call pointer. Fixes: 849979051cbc ("rxrpc: Add a tracepoint to follow what recvmsg does") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/recvmsg.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 36b25d003cf0..6ebd6440a2b7 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -388,13 +388,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct list_head *l; + unsigned int call_debug_id = 0; size_t copied = 0; long timeo; int ret; DEFINE_WAIT(wait); - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_enter, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -431,7 +432,7 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -450,7 +451,8 @@ try_again: rxrpc_get_call(call, rxrpc_call_get_recvmsg); write_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); + call_debug_id = call->debug_id; + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. @@ -531,7 +533,7 @@ try_again: error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_recvmsg); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; error_requeue_call: @@ -539,14 +541,14 @@ error_requeue_call: write_lock(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); write_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); } error_no_call: release_sock(&rx->sk); error_trace: - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; wait_interrupted: -- cgit v1.2.3 From 9deb1e9fb88b1120a908676fa33bdf9e2eeaefce Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:23 +0300 Subject: net/ethtool/ioctl: return -EOPNOTSUPP if we have no phy stats It's not very useful to copy back an empty ethtool_stats struct and return 0 if we didn't actually have any stats. This also allows for further simplification of this function in the future commits. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index c2f1a542e6fa..932fa8225b2f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2099,7 +2099,8 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; - WARN_ON_ONCE(!n_stats); + if (WARN_ON_ONCE(!n_stats)) + return -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; -- cgit v1.2.3 From fd4778581d61d8848b532f8cdc9b325138748437 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:24 +0300 Subject: net/ethtool/ioctl: remove if n_stats checks from ethtool_get_phy_stats Now that we always early return if we don't have any stats we can remove these checks as they're no longer necessary. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 932fa8225b2f..85f0cffdcec8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2107,28 +2107,24 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) stats.n_stats = n_stats; - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; + data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!data) + return -ENOMEM; - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_stats) { - ret = phy_ops->get_stats(phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } + if (phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_stats) { + ret = phy_ops->get_stats(phydev, &stats, data); + if (ret < 0) + goto out; } else { - data = NULL; + ops->get_ethtool_phy_stats(dev, &stats, data); } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) + if (copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; -- cgit v1.2.3 From 201ed315f9676809cd5b20a39206e964106d4f27 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:25 +0300 Subject: net/ethtool/ioctl: split ethtool_get_phy_stats into multiple helpers So that it's easier to follow and make sense of the branching and various conditions. Stats retrieval has been split into two separate functions ethtool_get_phy_stats_phydev & ethtool_get_phy_stats_ethtool. The former attempts to retrieve the stats using phydev & phy_ops, while the latter uses ethtool_ops. Actual n_stats validation & array allocation has been moved into a new ethtool_vzalloc_stats_array helper. This also fixes a potential NULL dereference of ops->get_ethtool_phy_stats where it was getting called in an else branch unconditionally without making sure it was actually present. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 102 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 85f0cffdcec8..646b3e490c71 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2078,23 +2078,8 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } -static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { - const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - struct ethtool_stats stats; - u64 *data; - int ret, n_stats; - - if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) - return -EOPNOTSUPP; - - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_sset_count) - n_stats = phy_ops->get_sset_count(phydev); - else - n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) @@ -2102,31 +2087,82 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (WARN_ON_ONCE(!n_stats)) return -EOPNOTSUPP; + *data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!*data) + return -ENOMEM; + + return 0; +} + +static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, + struct ethtool_stats *stats, + u64 **data) + { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; + int n_stats, ret; + + if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) + return -EOPNOTSUPP; + + n_stats = phy_ops->get_sset_count(phydev); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + return phy_ops->get_stats(phydev, stats, *data); +} + +static int ethtool_get_phy_stats_ethtool(struct net_device *dev, + struct ethtool_stats *stats, + u64 **data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int n_stats, ret; + + if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) + return -EOPNOTSUPP; + + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + ops->get_ethtool_phy_stats(dev, stats, *data); + + return 0; +} + +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; + u64 *data = NULL; + int ret = -EOPNOTSUPP; + if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; - stats.n_stats = n_stats; + if (phydev) + ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; + if (ret == -EOPNOTSUPP) + ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_stats) { - ret = phy_ops->get_stats(phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } + if (ret) + goto out; - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) + if (copy_to_user(useraddr, &stats, sizeof(stats))) { + ret = -EFAULT; goto out; + } + useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) - goto out; - ret = 0; + if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) + ret = -EFAULT; out: vfree(data); -- cgit v1.2.3 From 936a192f974018b4f6040f6f77b1cc1e75bd8666 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 26 Dec 2022 22:27:52 +0900 Subject: tcp: Add TIME_WAIT sockets in bhash2. Jiri Slaby reported regression of bind() with a simple repro. [0] The repro creates a TIME_WAIT socket and tries to bind() a new socket with the same local address and port. Before commit 28044fc1d495 ("net: Add a bhash2 table hashed by port and address"), the bind() failed with -EADDRINUSE, but now it succeeds. The cited commit should have put TIME_WAIT sockets into bhash2; otherwise, inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind() requests if the address is not a wildcard one. The straight option is to move sk_bind2_node from struct sock to struct sock_common to add twsk to bhash2 as implemented as RFC. [1] However, the binary layout change in the struct sock could affect performances moving hot fields on different cachelines. To avoid that, we add another TIME_WAIT list in inet_bind2_bucket and check it while validating bind(). [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/ [1]: https://lore.kernel.org/netdev/20221221151258.25748-2-kuniyu@amazon.com/ Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Jiri Slaby Suggested-by: Paolo Abeni Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 26 ++++++++++++++++++++++---- net/ipv4/inet_hashtables.c | 8 +++++--- net/ipv4/inet_timewait_sock.c | 31 +++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b366ab9148f2..848ffc3e0239 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, return false; } +static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, + kuid_t sk_uid, bool relax, + bool reuseport_cb_ok, bool reuseport_ok) +{ + if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) + return false; + + return inet_bind_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok); +} + static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { + struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - continue; + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) + return true; + } - if (inet_bind_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) + twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { + sk2 = (struct sock *)tw2; + + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) return true; } + return false; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d039b4e732a3..24a38b56fab9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, #endif tb->rcv_saddr = sk->sk_rcv_saddr; INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->deathrow); hlist_add_head(&tb->node, &head->chain); } @@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -1103,15 +1104,16 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); - spin_unlock(&head2->lock); - if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); + + spin_unlock(&head2->lock); spin_unlock(&head->lock); + if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 66fc940f9521..1d77d992e6e7 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -29,6 +29,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { + struct inet_bind2_bucket *tb2 = tw->tw_tb2; struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) @@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + + __hlist_del(&tw->tw_bind2_node); + tw->tw_tb2 = NULL; + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + __sock_put((struct sock *)tw); } @@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); @@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, + twsk_net(tw), tw->tw_num); spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); inet_twsk_bind_unhash(tw, hashinfo); + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); refcount_dec(&tw->tw_dr->tw_refcount); @@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, hlist_add_head(&tw->tw_bind_node, list); } +static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind2_node, list); +} + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; + /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); + spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); + tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + + tw->tw_tb2 = icsk->icsk_bind2_hash; + WARN_ON(!icsk->icsk_bind2_hash); + inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); spin_lock(lock); -- cgit v1.2.3 From a2965c7be0522eaa18808684b7b82b248515511b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 1 Jan 2023 16:57:43 -0500 Subject: net: sched: atm: dont intepret cls results when asked to drop If asked to drop a packet via TC_ACT_SHOT it is unsafe to assume res.class contains a valid pointer Fixes: b0188d4dbe5f ("[NET_SCHED]: sch_atm: Lindent") Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f52255fea652..4a981ca90b0b 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -393,10 +393,13 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, result = tcf_classify(skb, NULL, fl, &res, true); if (result < 0) continue; + if (result == TC_ACT_SHOT) + goto done; + flow = (struct atm_flow_data *)res.class; if (!flow) flow = lookup_flow(sch, res.classid); - goto done; + goto drop; } } flow = NULL; -- cgit v1.2.3 From caa4b35b4317d5147b3ab0fbdc9c075c7d2e9c12 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 1 Jan 2023 16:57:44 -0500 Subject: net: sched: cbq: dont intepret cls results when asked to drop If asked to drop a packet via TC_ACT_SHOT it is unsafe to assume that res.class contains a valid pointer Sample splat reported by Kyle Zeng [ 5.405624] 0: reclassify loop, rule prio 0, protocol 800 [ 5.406326] ================================================================== [ 5.407240] BUG: KASAN: slab-out-of-bounds in cbq_enqueue+0x54b/0xea0 [ 5.407987] Read of size 1 at addr ffff88800e3122aa by task poc/299 [ 5.408731] [ 5.408897] CPU: 0 PID: 299 Comm: poc Not tainted 5.10.155+ #15 [ 5.409516] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 5.410439] Call Trace: [ 5.410764] dump_stack+0x87/0xcd [ 5.411153] print_address_description+0x7a/0x6b0 [ 5.411687] ? vprintk_func+0xb9/0xc0 [ 5.411905] ? printk+0x76/0x96 [ 5.412110] ? cbq_enqueue+0x54b/0xea0 [ 5.412323] kasan_report+0x17d/0x220 [ 5.412591] ? cbq_enqueue+0x54b/0xea0 [ 5.412803] __asan_report_load1_noabort+0x10/0x20 [ 5.413119] cbq_enqueue+0x54b/0xea0 [ 5.413400] ? __kasan_check_write+0x10/0x20 [ 5.413679] __dev_queue_xmit+0x9c0/0x1db0 [ 5.413922] dev_queue_xmit+0xc/0x10 [ 5.414136] ip_finish_output2+0x8bc/0xcd0 [ 5.414436] __ip_finish_output+0x472/0x7a0 [ 5.414692] ip_finish_output+0x5c/0x190 [ 5.414940] ip_output+0x2d8/0x3c0 [ 5.415150] ? ip_mc_finish_output+0x320/0x320 [ 5.415429] __ip_queue_xmit+0x753/0x1760 [ 5.415664] ip_queue_xmit+0x47/0x60 [ 5.415874] __tcp_transmit_skb+0x1ef9/0x34c0 [ 5.416129] tcp_connect+0x1f5e/0x4cb0 [ 5.416347] tcp_v4_connect+0xc8d/0x18c0 [ 5.416577] __inet_stream_connect+0x1ae/0xb40 [ 5.416836] ? local_bh_enable+0x11/0x20 [ 5.417066] ? lock_sock_nested+0x175/0x1d0 [ 5.417309] inet_stream_connect+0x5d/0x90 [ 5.417548] ? __inet_stream_connect+0xb40/0xb40 [ 5.417817] __sys_connect+0x260/0x2b0 [ 5.418037] __x64_sys_connect+0x76/0x80 [ 5.418267] do_syscall_64+0x31/0x50 [ 5.418477] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 5.418770] RIP: 0033:0x473bb7 [ 5.418952] Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 18 89 54 24 0c 48 89 34 24 89 [ 5.420046] RSP: 002b:00007fffd20eb0f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a [ 5.420472] RAX: ffffffffffffffda RBX: 00007fffd20eb578 RCX: 0000000000473bb7 [ 5.420872] RDX: 0000000000000010 RSI: 00007fffd20eb110 RDI: 0000000000000007 [ 5.421271] RBP: 00007fffd20eb150 R08: 0000000000000001 R09: 0000000000000004 [ 5.421671] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 [ 5.422071] R13: 00007fffd20eb568 R14: 00000000004fc740 R15: 0000000000000002 [ 5.422471] [ 5.422562] Allocated by task 299: [ 5.422782] __kasan_kmalloc+0x12d/0x160 [ 5.423007] kasan_kmalloc+0x5/0x10 [ 5.423208] kmem_cache_alloc_trace+0x201/0x2e0 [ 5.423492] tcf_proto_create+0x65/0x290 [ 5.423721] tc_new_tfilter+0x137e/0x1830 [ 5.423957] rtnetlink_rcv_msg+0x730/0x9f0 [ 5.424197] netlink_rcv_skb+0x166/0x300 [ 5.424428] rtnetlink_rcv+0x11/0x20 [ 5.424639] netlink_unicast+0x673/0x860 [ 5.424870] netlink_sendmsg+0x6af/0x9f0 [ 5.425100] __sys_sendto+0x58d/0x5a0 [ 5.425315] __x64_sys_sendto+0xda/0xf0 [ 5.425539] do_syscall_64+0x31/0x50 [ 5.425764] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 5.426065] [ 5.426157] The buggy address belongs to the object at ffff88800e312200 [ 5.426157] which belongs to the cache kmalloc-128 of size 128 [ 5.426955] The buggy address is located 42 bytes to the right of [ 5.426955] 128-byte region [ffff88800e312200, ffff88800e312280) [ 5.427688] The buggy address belongs to the page: [ 5.427992] page:000000009875fabc refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xe312 [ 5.428562] flags: 0x100000000000200(slab) [ 5.428812] raw: 0100000000000200 dead000000000100 dead000000000122 ffff888007843680 [ 5.429325] raw: 0000000000000000 0000000000100010 00000001ffffffff ffff88800e312401 [ 5.429875] page dumped because: kasan: bad access detected [ 5.430214] page->mem_cgroup:ffff88800e312401 [ 5.430471] [ 5.430564] Memory state around the buggy address: [ 5.430846] ffff88800e312180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.431267] ffff88800e312200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc [ 5.431705] >ffff88800e312280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.432123] ^ [ 5.432391] ffff88800e312300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc [ 5.432810] ffff88800e312380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.433229] ================================================================== [ 5.433648] Disabling lock debugging due to kernel taint Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Kyle Zeng Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6568e17c4c63..36db5f6782f2 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -230,6 +230,8 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) result = tcf_classify(skb, NULL, fl, &res, true); if (!fl || result < 0) goto fallback; + if (result == TC_ACT_SHOT) + return NULL; cl = (void *)res.class; if (!cl) { @@ -250,8 +252,6 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; - case TC_ACT_SHOT: - return NULL; case TC_ACT_RECLASSIFY: return cbq_reclassify(skb, cl); } -- cgit v1.2.3 From 43d253781f6321c6a07a5fe4ee72103a679a5f6b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Jan 2023 23:17:37 -0800 Subject: net: sched: htb: fix htb_classify() kernel-doc Fix W=1 kernel-doc warning: net/sched/sch_htb.c:214: warning: expecting prototype for htb_classify(). Prototype was for HTB_DIRECT() instead by moving the HTB_DIRECT() macro above the function. Add kernel-doc notation for function parameters as well. Signed-off-by: Randy Dunlap Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index e5b4bbf3ce3d..2238edece1a4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -199,8 +199,14 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) { return (unsigned long)htb_find(handle, sch); } + +#define HTB_DIRECT ((struct htb_class *)-1L) + /** * htb_classify - classify a packet into class + * @skb: the socket buffer + * @sch: the active queue discipline + * @qerr: pointer for returned status code * * It returns NULL if the packet should be dropped or -1 if the packet * should be passed directly thru. In all other cases leaf class is returned. @@ -211,8 +217,6 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful * then finish and return direct queue. */ -#define HTB_DIRECT ((struct htb_class *)-1L) - static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { -- cgit v1.2.3 From a31d47be64b9b74f8cfedffe03e0a8a1f9e51f23 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 30 Dec 2022 13:24:37 +0100 Subject: netfilter: ipset: fix hash:net,port,net hang with /0 subnet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hash:net,port,net set type supports /0 subnets. However, the patch commit 5f7b51bf09baca8e titled "netfilter: ipset: Limit the maximal range of consecutive elements to add/delete" did not take into account it and resulted in an endless loop. The bug is actually older but the patch 5f7b51bf09baca8e brings it out earlier. Handle /0 subnets properly in hash:net,port,net set types. Fixes: 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") Reported-by: Марк Коренберг Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 19bcdb3141f6..005a7ce87217 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -173,17 +173,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } +static u32 +hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ + if (from == 0 && to == UINT_MAX) { + *cidr = 0; + return to; + } + return ip_set_range_to_cidr(from, to, cidr); +} + static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netportnet4 *h = set->data; + struct hash_netportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2, ipn; - u64 n = 0, m = 0; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; int ret; @@ -285,19 +294,6 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -310,13 +306,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], do { e.ip[0] = htonl(ip); - ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); + ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip[1] = htonl(ip2); - ip2 = ip_set_range_to_cidr(ip2, ip2_to, - &e.cidr[1]); + if (i > IPSET_MAX_RANGE) { + hash_netportnet4_data_next(&h->next, + &e); + return -ERANGE; + } + ip2 = hash_netportnet4_range_to_cidr(ip2, + ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; -- cgit v1.2.3 From 5e29dc36bd5e2166b834ceb19990d9e68a734d7d Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 30 Dec 2022 13:24:38 +0100 Subject: netfilter: ipset: Rework long task execution when adding/deleting entries When adding/deleting large number of elements in one step in ipset, it can take a reasonable amount of time and can result in soft lockup errors. The patch 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") tried to fix it by limiting the max elements to process at all. However it was not enough, it is still possible that we get hung tasks. Lowering the limit is not reasonable, so the approach in this patch is as follows: rely on the method used at resizing sets and save the state when we reach a smaller internal batch limit, unlock/lock and proceed from the saved state. Thus we can avoid long continuous tasks and at the same time removed the limit to add/delete large number of elements in one step. The nfnl mutex is held during the whole operation which prevents one to issue other ipset commands in parallel. Fixes: 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") Reported-by: syzbot+9204e7399656300bf271@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 7 ++++--- net/netfilter/ipset/ip_set_hash_ip.c | 14 +++++++------- net/netfilter/ipset/ip_set_hash_ipmark.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipport.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipportip.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 13 ++++++++----- net/netfilter/ipset/ip_set_hash_net.c | 17 +++++++---------- net/netfilter/ipset/ip_set_hash_netiface.c | 15 ++++++--------- net/netfilter/ipset/ip_set_hash_netnet.c | 23 +++++++---------------- net/netfilter/ipset/ip_set_hash_netport.c | 19 +++++++------------ 10 files changed, 67 insertions(+), 80 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e7ba5b6dd2b7..46ebee9400da 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1698,9 +1698,10 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; - } while (ret == -EAGAIN && - set->variant->resize && - (ret = set->variant->resize(set, retried)) == 0); + } while (ret == -ERANGE || + (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e30513cefd90..c9f4e3859663 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -100,11 +100,11 @@ static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ip4 *h = set->data; + struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, hosts; + u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) @@ -149,14 +149,14 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - /* 64bit division is not allowed on 32bit */ - if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to;) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 153de3457423..a22ec1a6f6ec 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -97,11 +97,11 @@ static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipmark4 *h = set->data; + struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0; + u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -148,13 +148,14 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ipmark4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 2ffbd0b78a8c..e977b5a9c48d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -112,11 +112,11 @@ static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipport4 *h = set->data; + struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -184,17 +184,18 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 334fb1ad0e86..39a01934b153 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -108,11 +108,11 @@ static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportip4 *h = set->data; + struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -180,17 +180,18 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipportip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7df94f437f60..5c6de605a9fb 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -160,12 +160,12 @@ static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportnet4 *h = set->data; + struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -253,9 +253,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); @@ -282,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; + if (i > IPSET_MAX_RANGE) { + hash_ipportnet4_data_next(&h->next, + &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1422739d9aa2..ce0a9ce5a91f 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -136,11 +136,11 @@ static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_net4 *h = set->data; + struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,19 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_net4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 9810f5bf63f5..031073286236 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,19 +256,16 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_netiface4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index cdfb78c6e0d3..8fbe649c9dd3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -166,13 +166,12 @@ static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netnet4 *h = set->data; + struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; - u64 n = 0, m = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -248,19 +247,6 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -273,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { + i++; e.ip[1] = htonl(ip2); + if (i > IPSET_MAX_RANGE) { + hash_netnet4_data_next(&h->next, &e); + return -ERANGE; + } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 09cf72eb37f8..d1a0628df4ef 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -154,12 +154,11 @@ static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netport4 *h = set->data; + struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; - u64 n = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -236,14 +235,6 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); - n++; - } while (ipn++ < ip_to); - - if (n*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip); @@ -255,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_netport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; -- cgit v1.2.3 From 2c02d41d71f90a5168391b6a5f2954112ba2307c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 3 Jan 2023 12:19:17 +0100 Subject: net/ulp: prevent ULP without clone op from entering the LISTEN status When an ULP-enabled socket enters the LISTEN status, the listener ULP data pointer is copied inside the child/accepted sockets by sk_clone_lock(). The relevant ULP can take care of de-duplicating the context pointer via the clone() operation, but only MPTCP and SMC implement such op. Other ULPs may end-up with a double-free at socket disposal time. We can't simply clear the ULP data at clone time, as TLS replaces the socket ops with custom ones assuming a valid TLS ULP context is available. Instead completely prevent clone-less ULP sockets from entering the LISTEN status. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Reported-by: slipper Signed-off-by: Paolo Abeni Link: https://lore.kernel.org/r/4b80c3d1dbe3d0ab072f80450c202d9bc88b4b03.1672740602.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ net/ipv4/tcp_ulp.c | 4 ++++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 848ffc3e0239..d1f837579398 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1200,12 +1200,26 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); +static int inet_ulp_can_listen(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) + return -EINVAL; + + return 0; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; + err = inet_ulp_can_listen(sk); + if (unlikely(err)) + return err; + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 9ae50b1bd844..05b6077b9f2c 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -139,6 +139,10 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + err = -EINVAL; + if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) + goto out_err; + err = ulp_ops->init(sk); if (err) goto out_err; -- cgit v1.2.3 From 1ac88557447088ccd15eb2f2520ce46d463c8e0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Jan 2023 19:27:36 +0000 Subject: inet: control sockets should not use current thread task_frag Because ICMP handlers run from softirq contexts, they must not use current thread task_frag. Previously, all sockets allocated by inet_ctl_sock_create() would use the per-socket page fragment, with no chance of recursion. Fixes: 98123866fcf3 ("Treewide: Stop corrupting socket's task_frag") Reported-by: syzbot+bebc6f1acdf4cbb79b03@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Benjamin Coddington Acked-by: Guillaume Nault Link: https://lore.kernel.org/r/20230103192736.454149-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ab4a06be489b..6c0ec2789943 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1665,6 +1665,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; + (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. -- cgit v1.2.3 From fe69230f05897b3de758427b574fc98025dfc907 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 4 Jan 2023 14:51:46 +0800 Subject: caif: fix memory leak in cfctrl_linkup_request() When linktype is unknown or kzalloc failed in cfctrl_linkup_request(), pkt is not released. Add release process to error path. Fixes: b482cd2053e3 ("net-caif: add CAIF core protocol stack") Fixes: 8d545c8f958f ("caif: Disconnect without waiting for response") Signed-off-by: Zhengchao Shao Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230104065146.1153009-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/caif/cfctrl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index cc405d8c7c30..8480684f2762 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -269,11 +269,15 @@ int cfctrl_linkup_request(struct cflayer *layer, default: pr_warn("Request setup of bad link type = %d\n", param->linktype); + cfpkt_destroy(pkt); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) + if (!req) { + cfpkt_destroy(pkt); return -ENOMEM; + } + req->client_layer = user_layer; req->cmd = CFCTRL_CMD_LINK_SETUP; req->param = *param; -- cgit v1.2.3 From 8a758d98dba380a7d32a98b0840ad707e3036233 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 23:17:06 +0100 Subject: rxrpc: Stash the network namespace pointer in rxrpc_local Stash the network namespace pointer in the rxrpc_local struct in addition to a pointer to the rxrpc-specific net namespace info. Use this to remove some places where the socket is passed as a parameter. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 10 +++++----- net/rxrpc/call_accept.c | 2 +- net/rxrpc/conn_client.c | 2 +- net/rxrpc/local_object.c | 7 ++++--- net/rxrpc/peer_object.c | 23 ++++++++++------------- 5 files changed, 21 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 18092526d3c8..9cf763c338bc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -283,7 +283,8 @@ struct rxrpc_local { struct rcu_head rcu; atomic_t active_users; /* Number of users of the local endpoint */ refcount_t ref; /* Number of references to the structure */ - struct rxrpc_net *rxnet; /* The network ns in which this resides */ + struct net *net; /* The network namespace */ + struct rxrpc_net *rxnet; /* Our bits in the network namespace */ struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; @@ -1063,12 +1064,11 @@ void rxrpc_peer_keepalive_worker(struct work_struct *); */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *, - struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, gfp_t gfp); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, enum rxrpc_peer_trace); -void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index c02401656fa9..c957e4415cdc 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -280,7 +280,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - rxrpc_new_incoming_peer(rx, local, peer); + rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 87efa0373aed..e4063c4f4bb2 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -378,7 +378,7 @@ static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx, _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp); + cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); if (!cp->peer) goto error; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 270b63d8f37a..c0ac2fe07ec4 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -85,7 +85,7 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, /* * Allocate a new local endpoint. */ -static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, +static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; @@ -94,7 +94,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); - local->rxnet = rxnet; + local->net = net; + local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); init_rwsem(&local->defrag_sem); init_completion(&local->io_thread_ready); @@ -248,7 +249,7 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, goto found; } - local = rxrpc_alloc_local(rxnet, srx); + local = rxrpc_alloc_local(net, srx); if (!local) goto nomem; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 4eecea2be307..8d7a715a0bb1 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -147,10 +147,10 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, * assess the MTU size for the network interface through which this peer is * reached */ -static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, +static void rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer) { - struct net *net = sock_net(&rx->sk); + struct net *net = local->net; struct dst_entry *dst; struct rtable *rt; struct flowi fl; @@ -236,11 +236,11 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, /* * Initialise peer record. */ -static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, +static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(rx, peer); + rxrpc_assess_MTU_size(local, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); @@ -272,8 +272,7 @@ static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, /* * Set up a new peer. */ -static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, - struct rxrpc_local *local, +static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) @@ -285,7 +284,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, peer = rxrpc_alloc_peer(local, gfp, rxrpc_peer_new_client); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); - rxrpc_init_peer(rx, peer, hash_key); + rxrpc_init_peer(local, peer, hash_key); } _leave(" = %p", peer); @@ -304,14 +303,13 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. */ -void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, - struct rxrpc_peer *peer) +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); - rxrpc_init_peer(rx, peer, hash_key); + rxrpc_init_peer(local, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); @@ -322,8 +320,7 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, /* * obtain a remote transport endpoint for the specified address */ -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, - struct rxrpc_local *local, +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; @@ -343,7 +340,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ - candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp); + candidate = rxrpc_create_peer(local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; -- cgit v1.2.3 From 5040011d073d3acdeb58af2b64f84e33bb03abd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Nov 2022 10:24:29 +0000 Subject: rxrpc: Make the local endpoint hold a ref on a connected call Make the local endpoint and it's I/O thread hold a reference on a connected call until that call is disconnected. Without this, we're reliant on either the AF_RXRPC socket to hold a ref (which is dropped when the call is released) or a queued work item to hold a ref (the work item is being replaced with the I/O thread). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/call_object.c | 2 ++ net/rxrpc/conn_client.c | 6 +++--- net/rxrpc/conn_object.c | 25 +++++++++++++++---------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 89dcf60b1158..239fc3c75079 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -453,6 +453,8 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, BUG(); } + rxrpc_get_call(call, rxrpc_call_get_io_thread); + /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index e4063c4f4bb2..1edd65883c55 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -725,8 +725,11 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); + rxrpc_get_call(call, rxrpc_call_get_io_thread); + bundle = rxrpc_prep_call(rx, call, cp, srx, gfp); if (IS_ERR(bundle)) { + rxrpc_put_call(call, rxrpc_call_get_io_thread); ret = PTR_ERR(bundle); goto out; } @@ -820,7 +823,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _enter("c=%x", call->debug_id); spin_lock(&bundle->channel_lock); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); /* Calls that have never actually been assigned a channel can simply be * discarded. @@ -912,8 +914,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call out: spin_unlock(&bundle->channel_lock); - _leave(""); - return; } /* diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3c8f83dacb2b..2bd3f6288895 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -178,6 +178,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); + rxrpc_see_call(call, rxrpc_call_see_disconnected); + call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { @@ -186,18 +189,20 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) spin_unlock(&call->peer->lock); } - if (rxrpc_is_client_call(call)) - return rxrpc_disconnect_client_call(conn->bundle, call); + if (rxrpc_is_client_call(call)) { + rxrpc_disconnect_client_call(conn->bundle, call); + } else { + spin_lock(&conn->bundle->channel_lock); + __rxrpc_disconnect_call(conn, call); + spin_unlock(&conn->bundle->channel_lock); - spin_lock(&conn->bundle->channel_lock); - __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->bundle->channel_lock); + conn->idle_timestamp = jiffies; + if (atomic_dec_and_test(&conn->active)) + rxrpc_set_service_reap_timer(conn->rxnet, + jiffies + rxrpc_connection_expiry); + } - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - conn->idle_timestamp = jiffies; - if (atomic_dec_and_test(&conn->active)) - rxrpc_set_service_reap_timer(conn->rxnet, - jiffies + rxrpc_connection_expiry); + rxrpc_put_call(call, rxrpc_call_put_io_thread); } /* -- cgit v1.2.3 From 30df927b936b2ef21eb07dce9c141c7897609643 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 8 Oct 2022 14:33:50 +0100 Subject: rxrpc: Separate call retransmission from other conn events Call the rxrpc_conn_retransmit_call() directly from rxrpc_input_packet() rather than calling it via connection event handling. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/conn_event.c | 34 +++++----------------------------- net/rxrpc/io_thread.c | 2 +- 3 files changed, 8 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9cf763c338bc..f3b8806e7241 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -897,6 +897,8 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c */ +void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int channel); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 480364bcbf85..dfd29882126f 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,9 +19,9 @@ /* * Retransmit terminal ACK or ABORT of the previous call. */ -static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, - struct sk_buff *skb, - unsigned int channel) +void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, + struct sk_buff *skb, + unsigned int channel) { struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; @@ -292,24 +292,6 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_DATA: - case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb, - sp->hdr.cid & RXRPC_CHANNELMASK); - return 0; - - case RXRPC_PACKET_TYPE_BUSY: - /* Just ignore BUSY packets for now. */ - return 0; - - case RXRPC_PACKET_TYPE_ABORT: - conn->error = -ECONNABORTED; - conn->abort_code = skb->priority; - conn->state = RXRPC_CONN_REMOTELY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); - return -ECONNABORTED; - case RXRPC_PACKET_TYPE_CHALLENGE: return conn->security->respond_to_challenge(conn, skb, _abort_code); @@ -504,18 +486,12 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { _leave(" = -ECONNABORTED [%u]", conn->state); - return -ECONNABORTED; + return 0; } _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_DATA: - case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb, - sp->hdr.cid & RXRPC_CHANNELMASK); - return 0; - case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets for now. */ return 0; @@ -526,7 +502,7 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) conn->state = RXRPC_CONN_REMOTELY_ABORTED; set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); - return -ECONNABORTED; + return 0; case RXRPC_PACKET_TYPE_CHALLENGE: case RXRPC_PACKET_TYPE_RESPONSE: diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 1ad067d66fb6..0e1a548d35f8 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -358,7 +358,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, sp->hdr.seq, sp->hdr.serial, sp->hdr.flags); - rxrpc_input_conn_packet(conn, skb); + rxrpc_conn_retransmit_call(conn, skb, channel); return 0; } -- cgit v1.2.3 From a343b174b4bdde851033996960bca5ad1394d04b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 22:17:56 +0100 Subject: rxrpc: Only set/transmit aborts in the I/O thread Only set the abort call completion state in the I/O thread and only transmit ABORT packets from there. rxrpc_abort_call() can then be made to actually send the packet. Further, ABORT packets should only be sent if the call has been exposed to the network (ie. at least one attempted DATA transmission has occurred for it). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 7 ++++++- net/rxrpc/call_event.c | 16 +++++++++++++--- net/rxrpc/call_object.c | 7 ++++--- net/rxrpc/input.c | 6 ++---- net/rxrpc/recvmsg.c | 2 ++ net/rxrpc/sendmsg.c | 29 ++++++++++++++++++++++------- 6 files changed, 49 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f3b8806e7241..0cf28a56aec5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -625,7 +625,10 @@ struct rxrpc_call { unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ - u32 abort_code; /* Local/remote abort code */ + const char *send_abort_why; /* String indicating why the abort was sent */ + s32 send_abort; /* Abort code to be sent */ + short send_abort_err; /* Error to be associated with the abort */ + s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ @@ -1146,6 +1149,8 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, /* * sendmsg.c */ +bool rxrpc_propose_abort(struct rxrpc_call *call, + u32 abort_code, int error, const char *why); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b2cf448fb02c..b7efecf5ccfc 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -270,9 +270,11 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) { struct rxrpc_txbuf *txb; - if (rxrpc_is_client_call(call) && - !test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + if (list_empty(&call->tx_sendmsg)) + return; rxrpc_expose_client_call(call); + } while ((txb = list_first_entry_or_null(&call->tx_sendmsg, struct rxrpc_txbuf, call_link))) { @@ -336,6 +338,7 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) unsigned long now, next, t; rxrpc_serial_t ackr_serial; bool resend = false, expired = false; + s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -346,6 +349,14 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (call->state == RXRPC_CALL_COMPLETE) goto out; + /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ + abort_code = smp_load_acquire(&call->send_abort); + if (abort_code) { + rxrpc_abort_call(call->send_abort_why, call, 0, call->send_abort, + call->send_abort_err); + goto out; + } + if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) goto out; @@ -433,7 +444,6 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); } - rxrpc_send_abort_packet(call); goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 239fc3c75079..298b7c465d7e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -430,6 +430,8 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; + __set_bit(RXRPC_CALL_EXPOSED, &call->flags); + spin_lock(&conn->state_lock); switch (conn->state) { @@ -590,7 +592,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); - rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, -ECONNRESET); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, "SKR"); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } @@ -598,8 +600,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); - rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, -ECONNRESET); - rxrpc_send_abort_packet(call); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, "SKT"); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index d0e20e946e48..1f03a286620d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -12,8 +12,7 @@ static void rxrpc_proto_abort(const char *why, struct rxrpc_call *call, rxrpc_seq_t seq) { - if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) - rxrpc_send_abort_packet(call); + rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG); } /* @@ -1007,8 +1006,7 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_COMPLETE: break; default: - if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) - rxrpc_send_abort_packet(call); + rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN); trace_rxrpc_improper_term(call); break; } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6ebd6440a2b7..a4ccdc006d0f 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,8 @@ bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, write_lock(&call->state_lock); ret = __rxrpc_abort_call(why, call, seq, abort_code, error); write_unlock(&call->state_lock); + if (ret && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + rxrpc_send_abort_packet(call); return ret; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index cde1e65f16b4..dc3c2a834fc8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -17,6 +17,26 @@ #include #include "ar-internal.h" +/* + * Propose an abort to be made in the I/O thread. + */ +bool rxrpc_propose_abort(struct rxrpc_call *call, + u32 abort_code, int error, const char *why) +{ + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); + + if (!call->send_abort && call->state < RXRPC_CALL_COMPLETE) { + call->send_abort_why = why; + call->send_abort_err = error; + /* Request abort locklessly vs rxrpc_input_call_event(). */ + smp_store_release(&call->send_abort, abort_code); + rxrpc_poke_call(call, rxrpc_call_poke_abort); + return true; + } + + return false; +} + /* * Return true if there's sufficient Tx queue space. */ @@ -663,9 +683,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { + rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, "CMD"); ret = 0; - if (rxrpc_abort_call("CMD", call, 0, p.abort_code, -ECONNABORTED)) - ret = rxrpc_send_abort_packet(call); } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else { @@ -760,11 +779,7 @@ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); - - aborted = rxrpc_abort_call(why, call, 0, abort_code, error); - if (aborted) - rxrpc_send_abort_packet(call); - + aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } -- cgit v1.2.3 From 03fc55adf8761c546d72798264b019c9f672c578 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 17:01:25 +0100 Subject: rxrpc: Only disconnect calls in the I/O thread Only perform call disconnection in the I/O thread to reduce the locking requirement. This is the first part of a fix for a race that exists between call connection and call disconnection whereby the data transmission code adds the call to the peer error distribution list after the call has been disconnected (say by the rxrpc socket getting closed). The fix is to complete the process of moving call connection, data transmission and call disconnection into the I/O thread and thus forcibly serialising them. Note that the issue may predate the overhaul to an I/O thread model that were included in the merge window for v6.2, but the timing is very much changed by the change given below. Fixes: cf37b5987508 ("rxrpc: Move DATA transmission into call processor work item") Reported-by: syzbot+c22650d2844392afdcfd@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/call_event.c | 7 ++++++- net/rxrpc/call_object.c | 9 +-------- net/rxrpc/input.c | 6 ------ net/rxrpc/recvmsg.c | 1 + 4 files changed, 8 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b7efecf5ccfc..b2fc3fa686ec 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -484,8 +484,13 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } out: - if (call->state == RXRPC_CALL_COMPLETE) + if (call->state == RXRPC_CALL_COMPLETE) { del_timer_sync(&call->timer); + if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) + rxrpc_disconnect_call(call); + if (call->security) + call->security->free_call_crypto(call); + } if (call->acks_hard_ack != call->tx_bottom) rxrpc_shrink_call_tx_buffer(call); _leave(""); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 298b7c465d7e..13aac3ca03a0 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -50,7 +50,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) struct rxrpc_local *local = call->local; bool busy; - if (call->state < RXRPC_CALL_COMPLETE) { + if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { spin_lock_bh(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); @@ -533,13 +533,10 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), call->flags, rxrpc_call_see_release); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); rxrpc_put_call_slot(call); - del_timer_sync(&call->timer); /* Make sure we don't get any more notifications */ write_lock(&rx->recvmsg_lock); @@ -572,10 +569,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - rxrpc_disconnect_call(call); - if (call->security) - call->security->free_call_crypto(call); _leave(""); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1f03a286620d..bb4beb445325 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -997,8 +997,6 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) */ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_connection *conn = call->conn; - switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); @@ -1012,8 +1010,4 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) } rxrpc_input_call_event(call, skb); - - spin_lock(&conn->bundle->channel_lock); - __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->bundle->channel_lock); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a4ccdc006d0f..8d5fe65f5951 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -201,6 +201,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) case RXRPC_CALL_CLIENT_RECV_REPLY: __rxrpc_call_completed(call); write_unlock(&call->state_lock); + rxrpc_poke_call(call, rxrpc_call_poke_complete); break; case RXRPC_CALL_SERVER_RECV_REQUEST: -- cgit v1.2.3 From f2cce89a074e6d2991dddc94f6b6ebe1576b8459 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 09:08:34 +0100 Subject: rxrpc: Implement a mechanism to send an event notification to a connection Provide a means by which an event notification can be sent to a connection through such that the I/O thread can pick it up and handle it rather than doing it in a separate workqueue. This is then used to move the deferred final ACK of a call into the I/O thread rather than a separate work queue as part of the drive to do all transmission from the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 5 +++++ net/rxrpc/conn_event.c | 14 ++++++++++---- net/rxrpc/conn_object.c | 20 +++++++++++++++++++- net/rxrpc/io_thread.c | 19 ++++++++++++++++++- net/rxrpc/local_object.c | 1 + 5 files changed, 53 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0cf28a56aec5..d82d7f36cdaa 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -202,6 +202,7 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { + struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ u16 offset; /* Offset of data */ u16 len; /* Length of data */ u8 flags; @@ -292,6 +293,7 @@ struct rxrpc_local { struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head rx_queue; /* Received packets */ + struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ @@ -441,6 +443,7 @@ struct rxrpc_connection { struct rxrpc_peer *peer; /* Remote endpoint */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ + struct list_head attend_link; /* Link in local->conn_attend_q */ refcount_t ref; atomic_t active; /* Active count for service conns */ @@ -905,6 +908,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *s void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); +void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); /* * conn_object.c @@ -912,6 +916,7 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; +void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why); struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *, struct sockaddr_rxrpc *, diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index dfd29882126f..7a980a32344f 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -412,10 +412,6 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); - /* Process delayed ACKs whose time has come. */ - if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) - rxrpc_process_delayed_final_acks(conn, false); - /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { @@ -515,3 +511,13 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return -EPROTO; } } + +/* + * Input a connection event. + */ +void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + /* Process delayed ACKs whose time has come. */ + if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) + rxrpc_process_delayed_final_acks(conn, false); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 2bd3f6288895..281f59e356f5 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -23,12 +23,30 @@ static void rxrpc_clean_up_connection(struct work_struct *work); static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, unsigned long reap_at); +void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) +{ + struct rxrpc_local *local = conn->local; + bool busy; + + if (WARN_ON_ONCE(!local)) + return; + + spin_lock_bh(&local->lock); + busy = !list_empty(&conn->attend_link); + if (!busy) { + rxrpc_get_connection(conn, why); + list_add_tail(&conn->attend_link, &local->conn_attend_q); + } + spin_unlock_bh(&local->lock); + rxrpc_wake_up_io_thread(local); +} + static void rxrpc_connection_timer(struct timer_list *timer) { struct rxrpc_connection *conn = container_of(timer, struct rxrpc_connection, timer); - rxrpc_queue_conn(conn, rxrpc_conn_queue_timer); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_timer); } /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 0e1a548d35f8..46e58cf5bc96 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -421,6 +421,7 @@ reject_packet: */ int rxrpc_io_thread(void *data) { + struct rxrpc_connection *conn; struct sk_buff_head rx_queue; struct rxrpc_local *local = data; struct rxrpc_call *call; @@ -436,6 +437,20 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); + /* Deal with connections that want immediate attention. */ + conn = list_first_entry_or_null(&local->conn_attend_q, + struct rxrpc_connection, + attend_link); + if (conn) { + spin_lock_bh(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_bh(&local->lock); + + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); + continue; + } + /* Deal with calls that want immediate attention. */ if ((call = list_first_entry_or_null(&local->call_attend_q, struct rxrpc_call, @@ -463,6 +478,7 @@ int rxrpc_io_thread(void *data) rxrpc_input_error(local, skb); rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; + break; default: WARN_ON_ONCE(1); rxrpc_free_skb(skb, rxrpc_skb_put_unknown); @@ -481,7 +497,8 @@ int rxrpc_io_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || - !list_empty(&local->call_attend_q)) { + !list_empty(&local->call_attend_q) || + !list_empty(&local->conn_attend_q)) { __set_current_state(TASK_RUNNING); continue; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c0ac2fe07ec4..8ef6cd8defa4 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -100,6 +100,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, init_rwsem(&local->defrag_sem); init_completion(&local->io_thread_ready); skb_queue_head_init(&local->rx_queue); + INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); -- cgit v1.2.3 From a00ce28b1778fa3576575b43bdb17f60ded38b66 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 09:56:36 +0100 Subject: rxrpc: Clean up connection abort Clean up connection abort, using the connection state_lock to gate access to change that state, and use an rxrpc_call_completion value to indicate the difference between local and remote aborts as these can be pasted directly into the call state. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 46 ++++++---- net/rxrpc/call_object.c | 8 +- net/rxrpc/conn_event.c | 233 +++++++++++++++++------------------------------- net/rxrpc/insecure.c | 18 +--- net/rxrpc/output.c | 56 ++++++++++++ net/rxrpc/proc.c | 10 ++- net/rxrpc/rxkad.c | 28 +++--- 7 files changed, 186 insertions(+), 213 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d82d7f36cdaa..78bd6fb0bc15 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -263,13 +263,11 @@ struct rxrpc_security { /* respond to a challenge */ int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *, - u32 *); + struct sk_buff *); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, - struct sk_buff *, - u32 *); + struct sk_buff *); /* clear connection security */ void (*clear)(struct rxrpc_connection *); @@ -367,6 +365,18 @@ struct rxrpc_conn_parameters { u32 security_level; /* Security level selected */ }; +/* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + NR__RXRPC_CALL_COMPLETIONS +}; + /* * Bits in the connection flags. */ @@ -391,6 +401,7 @@ enum rxrpc_conn_flag { */ enum rxrpc_conn_event { RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ + RXRPC_CONN_EV_ABORT_CALLS, /* Abort attached calls */ }; /* @@ -403,8 +414,7 @@ enum rxrpc_conn_proto_state { RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ - RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ - RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ + RXRPC_CONN_ABORTED, /* Conn aborted */ RXRPC_CONN__NR_STATES }; @@ -487,7 +497,8 @@ struct rxrpc_connection { unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_proto_state state; /* current state of connection */ - u32 abort_code; /* Abort code of connection abort */ + enum rxrpc_call_completion completion; /* Completion condition */ + s32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ @@ -561,18 +572,6 @@ enum rxrpc_call_state { NR__RXRPC_CALL_STATES }; -/* - * Call completion condition (state == RXRPC_CALL_COMPLETE). - */ -enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - NR__RXRPC_CALL_COMPLETIONS -}; - /* * Call Tx congestion management modes. */ @@ -905,11 +904,19 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *); */ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, unsigned int channel); +int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, const char *why); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); +static inline bool rxrpc_is_conn_aborted(const struct rxrpc_connection *conn) +{ + /* Order reading the abort info after the state check. */ + return smp_load_acquire(&conn->state) == RXRPC_CONN_ABORTED; +} + /* * conn_object.c */ @@ -1059,6 +1066,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); +void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 13aac3ca03a0..666430182dfd 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -443,14 +443,10 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_RECV_REQUEST; break; - case RXRPC_CONN_REMOTELY_ABORTED: - __rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + case RXRPC_CONN_ABORTED: + __rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); break; - case RXRPC_CONN_LOCALLY_ABORTED: - __rxrpc_abort_call("CON", call, 1, - conn->abort_code, conn->error); - break; default: BUG(); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 7a980a32344f..753d91a9646f 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -16,6 +16,60 @@ #include #include "ar-internal.h" +/* + * Set the completion state on an aborted connection. + */ +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, + enum rxrpc_call_completion compl) +{ + bool aborted = false; + + if (conn->state != RXRPC_CONN_ABORTED) { + spin_lock(&conn->state_lock); + if (conn->state != RXRPC_CONN_ABORTED) { + conn->abort_code = abort_code; + conn->error = err; + conn->completion = compl; + /* Order the abort info before the state change. */ + smp_store_release(&conn->state, RXRPC_CONN_ABORTED); + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); + aborted = true; + } + spin_unlock(&conn->state_lock); + } + + return aborted; +} + +/* + * Mark a socket buffer to indicate that the connection it's on should be aborted. + */ +int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, const char *why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + RXRPC_CALL_LOCALLY_ABORTED)) { + trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, abort_code, err); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); + } + return -EPROTO; +} + +/* + * Mark a connection as being remotely aborted. + */ +static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); +} + /* * Retransmit terminal ACK or ABORT of the previous call. */ @@ -146,9 +200,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, /* * pass a connection-level abort onto all calls on that connection */ -static void rxrpc_abort_calls(struct rxrpc_connection *conn, - enum rxrpc_call_completion compl, - rxrpc_serial_t serial) +static void rxrpc_abort_calls(struct rxrpc_connection *conn) { struct rxrpc_call *call; int i; @@ -161,102 +213,17 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, call = rcu_dereference_protected( conn->channels[i].call, lockdep_is_held(&conn->bundle->channel_lock)); - if (call) { - if (compl == RXRPC_CALL_LOCALLY_ABORTED) - trace_rxrpc_abort(call->debug_id, - "CON", call->cid, - call->call_id, 0, - conn->abort_code, - conn->error); - else - trace_rxrpc_rx_abort(call, serial, - conn->abort_code); - rxrpc_set_call_completion(call, compl, + if (call) + rxrpc_set_call_completion(call, + conn->completion, conn->abort_code, conn->error); - } } spin_unlock(&conn->bundle->channel_lock); _leave(""); } -/* - * generate a connection-level abort - */ -static int rxrpc_abort_connection(struct rxrpc_connection *conn, - int error, u32 abort_code) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[2]; - __be32 word; - size_t len; - u32 serial; - int ret; - - _enter("%d,,%u,%u", conn->debug_id, error, abort_code); - - /* generate a connection-level abort */ - spin_lock(&conn->state_lock); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - spin_unlock(&conn->state_lock); - _leave(" = 0 [already dead]"); - return 0; - } - - conn->error = error; - conn->abort_code = abort_code; - conn->state = RXRPC_CONN_LOCALLY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - spin_unlock(&conn->state_lock); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(conn->proto.epoch); - whdr.cid = htonl(conn->proto.cid); - whdr.callNumber = 0; - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - whdr.flags = conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); - - word = htonl(conn->abort_code); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &word; - iov[1].iov_len = sizeof(word); - - len = iov[0].iov_len + iov[1].iov_len; - - serial = atomic_inc_return(&conn->serial); - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); - whdr.serial = htonl(serial); - - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_conn_abort); - _debug("sendmsg failed: %d", ret); - return -EAGAIN; - } - - trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); - - conn->peer->last_tx_at = ktime_get_seconds(); - - _leave(" = 0"); - return 0; -} - /* * mark a call as being on a now-secured channel * - must be called with BH's disabled. @@ -278,26 +245,22 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) * connection-level Rx packet processor */ static int rxrpc_process_event(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); int loop, ret; - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - _leave(" = -ECONNABORTED [%u]", conn->state); + if (conn->state == RXRPC_CONN_ABORTED) return -ECONNABORTED; - } _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb, - _abort_code); + return conn->security->respond_to_challenge(conn, skb); case RXRPC_PACKET_TYPE_RESPONSE: - ret = conn->security->verify_response(conn, skb, _abort_code); + ret = conn->security->verify_response(conn, skb); if (ret < 0) return ret; @@ -336,26 +299,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, */ static void rxrpc_secure_connection(struct rxrpc_connection *conn) { - u32 abort_code; - int ret; - - _enter("{%d}", conn->debug_id); - - ASSERT(conn->security_ix != 0); - - if (conn->security->issue_challenge(conn) < 0) { - abort_code = RX_CALL_DEAD; - ret = -ENOMEM; - goto abort; - } - - _leave(""); - return; - -abort: - _debug("abort %d, %d", ret, abort_code); - rxrpc_abort_connection(conn, ret, abort_code); - _leave(" [aborted]"); + if (conn->security->issue_challenge(conn) < 0) + rxrpc_abort_conn(conn, NULL, RX_CALL_DEAD, -ENOMEM, "OOM"); } /* @@ -406,7 +351,6 @@ again: static void rxrpc_do_process_connection(struct rxrpc_connection *conn) { struct sk_buff *skb; - u32 abort_code = RX_PROTOCOL_ERROR; int ret; if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) @@ -416,33 +360,18 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { rxrpc_see_skb(skb, rxrpc_skb_see_conn_work); - ret = rxrpc_process_event(conn, skb, &abort_code); + ret = rxrpc_process_event(conn, skb); switch (ret) { - case -EPROTO: - case -EKEYEXPIRED: - case -EKEYREJECTED: - goto protocol_error; case -ENOMEM: case -EAGAIN: - goto requeue_and_leave; - case -ECONNABORTED: + skb_queue_head(&conn->rx_queue, skb); + rxrpc_queue_conn(conn, rxrpc_conn_queue_retry_work); + break; default: rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); break; } } - - return; - -requeue_and_leave: - skb_queue_head(&conn->rx_queue, skb); - return; - -protocol_error: - if (rxrpc_abort_connection(conn, ret, abort_code) < 0) - goto requeue_and_leave; - rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); - return; } void rxrpc_process_connection(struct work_struct *work) @@ -480,28 +409,25 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - _leave(" = -ECONNABORTED [%u]", conn->state); - return 0; - } - - _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); - switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets for now. */ return 0; case RXRPC_PACKET_TYPE_ABORT: - conn->error = -ECONNABORTED; - conn->abort_code = skb->priority; - conn->state = RXRPC_CONN_REMOTELY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); - return 0; + if (rxrpc_is_conn_aborted(conn)) + return true; + rxrpc_input_conn_abort(conn, skb); + rxrpc_abort_calls(conn); + return true; case RXRPC_PACKET_TYPE_CHALLENGE: case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } rxrpc_post_packet_to_conn(conn, skb); return 0; @@ -517,6 +443,9 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) */ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) { + if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) + rxrpc_abort_calls(conn); + /* Process delayed ACKs whose time has come. */ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 0eb8471bfc53..29dcc7d3f51a 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -43,25 +43,15 @@ static void none_free_call_crypto(struct rxrpc_call *call) } static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("chall_none")); - return -EPROTO; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, "RXN"); } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("resp_none")); - return -EPROTO; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, "RXN"); } static void none_clear(struct rxrpc_connection *conn) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3d8c9f830ee0..8a5ff2c9e061 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -544,6 +544,62 @@ send_fragmentable: goto done; } +/* + * Transmit a connection-level abort. + */ +void rxrpc_send_conn_abort(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + u32 serial; + int ret; + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); + + word = htonl(conn->abort_code); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_conn_abort); + _debug("sendmsg failed: %d", ret); + return; + } + + trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); + + conn->peer->last_tx_at = ktime_get_seconds(); +} + /* * Reject a packet through the local endpoint. */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 3a59591ec061..63947cce4048 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -17,8 +17,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", - [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", - [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_ABORTED] = "Aborted ", }; /* @@ -143,6 +142,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + const char *state; char lbuff[50], rbuff[50]; if (v == &rxnet->conn_proc_list) { @@ -163,9 +163,11 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } sprintf(lbuff, "%pISpc", &conn->local->srx.transport); - sprintf(rbuff, "%pISpc", &conn->peer->srx.transport); print: + state = rxrpc_is_conn_aborted(conn) ? + rxrpc_call_completions[conn->completion] : + rxrpc_conn_states[conn->state]; seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %s %3u %3d" " %s %08x %08x %08x %08x %08x %08x %08x\n", @@ -176,7 +178,7 @@ print: rxrpc_conn_is_service(conn) ? "Svc" : "Clt", refcount_read(&conn->ref), atomic_read(&conn->active), - rxrpc_conn_states[conn->state], + state, key_serial(conn->key), atomic_read(&conn->serial), conn->hi_serial, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index d1233720e05f..5d2fbc6ec3cf 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -821,8 +821,7 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, * respond to a challenge packet */ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { const struct rxrpc_key_token *token; struct rxkad_challenge challenge; @@ -898,7 +897,7 @@ protocol_error: trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); ret = -EPROTO; other_error: - *_abort_code = abort_code; + rxrpc_abort_conn(conn, skb, abort_code, ret, "RXK"); return ret; } @@ -910,8 +909,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt *_session_key, - time64_t *_expiry, - u32 *_abort_code) + time64_t *_expiry) { struct skcipher_request *req; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -1042,8 +1040,7 @@ bad_ticket: abort_code = RXKADBADTICKET; ret = -EPROTO; other_error: - *_abort_code = abort_code; - return ret; + return rxrpc_abort_conn(conn, skb, abort_code, ret, "RXK"); temporary_error: return ret; } @@ -1086,8 +1083,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, * verify a response */ static int rxkad_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -1115,11 +1111,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, abort_code = RXKADNOAUTH; break; } - trace_rxrpc_abort(0, "SVK", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - abort_code, PTR_ERR(server_key)); - *_abort_code = abort_code; - return -EPROTO; + return rxrpc_abort_conn(conn, skb, abort_code, + PTR_ERR(server_key), "RXK"); } ret = -ENOMEM; @@ -1168,7 +1161,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, goto temporary_error_free_ticket; ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, - &session_key, &expiry, _abort_code); + &session_key, &expiry); if (ret < 0) goto temporary_error_free_ticket; @@ -1246,10 +1239,9 @@ protocol_error_free: kfree(ticket); protocol_error: kfree(response); - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); key_put(server_key); - *_abort_code = abort_code; - return -EPROTO; + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); + return rxrpc_abort_conn(conn, skb, abort_code, -EPROTO, "RXK"); temporary_error_free_ticket: kfree(ticket); -- cgit v1.2.3 From 57af281e5389b6fefedb3685f86847cbb0055f75 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Oct 2022 21:45:42 +0100 Subject: rxrpc: Tidy up abort generation infrastructure Tidy up the abort generation infrastructure in the following ways: (1) Create an enum and string mapping table to list the reasons an abort might be generated in tracing. (2) Replace the 3-char string with the values from (1) in the places that use that to log the abort source. This gets rid of a memcpy() in the tracepoint. (3) Subsume the rxrpc_rx_eproto tracepoint with the rxrpc_abort tracepoint and use values from (1) to indicate the trace reason. (4) Always make a call to an abort function at the point of the abort rather than stashing the values into variables and using goto to get to a place where it reported. The C optimiser will collapse the calls together as appropriate. The abort functions return a value that can be returned directly if appropriate. Note that this extends into afs also at the points where that generates an abort. To aid with this, the afs sources need to #define RXRPC_TRACE_ONLY_DEFINE_ENUMS before including the rxrpc tracing header because they don't have access to the rxrpc internal structures that some of the tracepoints make use of. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 51 ++++---- net/rxrpc/call_accept.c | 43 ++++--- net/rxrpc/call_event.c | 13 +- net/rxrpc/call_object.c | 6 +- net/rxrpc/conn_event.c | 19 ++- net/rxrpc/input.c | 65 +++++----- net/rxrpc/insecure.c | 6 +- net/rxrpc/io_thread.c | 163 +++++++++++------------- net/rxrpc/recvmsg.c | 18 +-- net/rxrpc/rxkad.c | 321 ++++++++++++++++++++---------------------------- net/rxrpc/rxperf.c | 17 ++- net/rxrpc/security.c | 14 +-- net/rxrpc/sendmsg.c | 20 +-- 13 files changed, 351 insertions(+), 405 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 78bd6fb0bc15..120ce3ccbb22 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -627,9 +627,10 @@ struct rxrpc_call { unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ - const char *send_abort_why; /* String indicating why the abort was sent */ + unsigned int send_abort_why; /* Why the abort [enum rxrpc_abort_reason] */ s32 send_abort; /* Abort code to be sent */ short send_abort_err; /* Error to be associated with the abort */ + rxrpc_seq_t send_abort_seq; /* DATA packet that incurred the abort (or 0) */ s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ enum rxrpc_call_state state; /* current state of call */ @@ -818,9 +819,11 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); -int rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, - struct rxrpc_connection *, struct sockaddr_rxrpc *, - struct sk_buff *); +bool rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); @@ -840,7 +843,7 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long now, enum rxrpc_timer_trace why); -void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); /* * call_object.c @@ -905,10 +908,10 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *); void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, unsigned int channel); int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, - s32 abort_code, int err, const char *why); + s32 abort_code, int err, enum rxrpc_abort_reason why); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); -int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); +bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); static inline bool rxrpc_is_conn_aborted(const struct rxrpc_connection *conn) @@ -979,12 +982,19 @@ void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); void rxrpc_error_report(struct sock *); +bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err); int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { wake_up_process(local->io_thread); } +static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) +{ + return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EPROTO); +} + /* * insecure.c */ @@ -1108,29 +1118,26 @@ bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); bool __rxrpc_call_completed(struct rxrpc_call *); bool rxrpc_call_completed(struct rxrpc_call *); -bool __rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); -bool rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); +bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why); +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * Abort a call due to a protocol error. */ -static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, - struct sk_buff *skb, - const char *eproto_why, - const char *why, - u32 abort_code) +static inline int rxrpc_abort_eproto(struct rxrpc_call *call, + struct sk_buff *skb, + s32 abort_code, + enum rxrpc_abort_reason why) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); - return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); + rxrpc_abort_call(call, sp->hdr.seq, abort_code, -EPROTO, why); + return -EPROTO; } -#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ - __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ - (abort_why), (abort_code)) - /* * rtt.c */ @@ -1162,8 +1169,8 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, /* * sendmsg.c */ -bool rxrpc_propose_abort(struct rxrpc_call *call, - u32 abort_code, int error, const char *why); +bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, + enum rxrpc_abort_reason why); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index c957e4415cdc..a132d486dea0 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -326,11 +326,11 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * If we want to report an error, we mark the skb with the packet type and * abort code and return false. */ -int rxrpc_new_incoming_call(struct rxrpc_local *local, - struct rxrpc_peer *peer, - struct rxrpc_connection *conn, - struct sockaddr_rxrpc *peer_srx, - struct sk_buff *skb) +bool rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb) { const struct rxrpc_security *sec = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -339,10 +339,9 @@ int rxrpc_new_incoming_call(struct rxrpc_local *local, _enter(""); - /* Don't set up a call for anything other than the first DATA packet. */ - if (sp->hdr.seq != 1 || - sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - return 0; /* Just discard */ + /* Don't set up a call for anything other than a DATA packet. */ + if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); rcu_read_lock(); @@ -363,16 +362,14 @@ int rxrpc_new_incoming_call(struct rxrpc_local *local, if (!conn) { sec = rxrpc_get_incoming_security(rx, skb); if (!sec) - goto reject; + goto unsupported_security; } spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { - trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; + rxrpc_direct_abort(skb, rxrpc_abort_shut_down, + RX_INVALID_OPERATION, -ESHUTDOWN); goto no_call; } @@ -413,22 +410,24 @@ int rxrpc_new_incoming_call(struct rxrpc_local *local, _leave(" = %p{%d}", call, call->debug_id); rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return 0; + return true; unsupported_service: - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->priority = RX_INVALID_OPERATION; - goto reject; + rcu_read_unlock(); + return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EOPNOTSUPP); +unsupported_security: + rcu_read_unlock(); + return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); -reject: rcu_read_unlock(); _leave(" = f [%u]", skb->mark); - return -EPROTO; + return false; discard: rcu_read_unlock(); - return 0; + return true; } /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b2fc3fa686ec..695aeb70d1a6 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -333,7 +333,7 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { unsigned long now, next, t; rxrpc_serial_t ackr_serial; @@ -352,8 +352,8 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { - rxrpc_abort_call(call->send_abort_why, call, 0, call->send_abort, - call->send_abort_err); + rxrpc_abort_call(call, 0, call->send_abort, call->send_abort_err, + call->send_abort_why); goto out; } @@ -440,9 +440,11 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && (int)call->conn->hi_serial - (int)call->rx_serial > 0) { trace_rxrpc_call_reset(call); - rxrpc_abort_call("EXP", call, 0, RX_CALL_DEAD, -ECONNRESET); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_reset); } else { - rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); + rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, + rxrpc_abort_call_timeout); } goto out; } @@ -494,4 +496,5 @@ out: if (call->acks_hard_ack != call->tx_bottom) rxrpc_shrink_call_tx_buffer(call); _leave(""); + return true; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 666430182dfd..705f6e26cc75 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -581,7 +581,8 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); - rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, "SKR"); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_sock_release_tba); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } @@ -589,7 +590,8 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); - rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, "SKT"); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_sock_release); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 753d91a9646f..485d7f0fed2c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -47,7 +47,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff * Mark a socket buffer to indicate that the connection it's on should be aborted. */ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, - s32 abort_code, int err, const char *why) + s32 abort_code, int err, enum rxrpc_abort_reason why) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -288,8 +288,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; default: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_conn_pkt")); + WARN_ON_ONCE(1); return -EPROTO; } } @@ -300,7 +299,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, static void rxrpc_secure_connection(struct rxrpc_connection *conn) { if (conn->security->issue_challenge(conn) < 0) - rxrpc_abort_conn(conn, NULL, RX_CALL_DEAD, -ENOMEM, "OOM"); + rxrpc_abort_conn(conn, NULL, RX_CALL_DEAD, -ENOMEM, + rxrpc_abort_nomem); } /* @@ -405,14 +405,14 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, /* * Input a connection-level packet. */ -int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets for now. */ - return 0; + return true; case RXRPC_PACKET_TYPE_ABORT: if (rxrpc_is_conn_aborted(conn)) @@ -429,12 +429,11 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; } rxrpc_post_packet_to_conn(conn, skb); - return 0; + return true; default: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_conn_pkt")); - return -EPROTO; + WARN_ON_ONCE(1); + return true; } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index bb4beb445325..bd69ff2d9082 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -9,10 +9,10 @@ #include "ar-internal.h" -static void rxrpc_proto_abort(const char *why, - struct rxrpc_call *call, rxrpc_seq_t seq) +static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, + enum rxrpc_abort_reason why) { - rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG); + rxrpc_abort_call(call, seq, RX_PROTOCOL_ERROR, -EBADMSG, why); } /* @@ -249,8 +249,8 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, * This occurs when we get an ACKALL packet, the first DATA packet of a reply, * or a final ACK packet. */ -static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, - const char *abort_why) +static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, + enum rxrpc_abort_reason abort_why) { unsigned int state; @@ -283,13 +283,12 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, else trace_rxrpc_txqueue(call, rxrpc_txqueue_end); _leave(" = ok"); - return true; + return; bad_state: write_unlock(&call->state_lock); kdebug("end_tx %s", rxrpc_call_states[call->state]); - rxrpc_proto_abort(abort_why, call, call->tx_top); - return false; + rxrpc_proto_abort(call, call->tx_top, abort_why); } /* @@ -311,11 +310,13 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { if (!rxrpc_rotate_tx_window(call, top, &summary)) { - rxrpc_proto_abort("TXL", call, top); + rxrpc_proto_abort(call, top, rxrpc_eproto_early_reply); return false; } } - return rxrpc_end_tx_phase(call, true, "ETD"); + + rxrpc_end_tx_phase(call, true, rxrpc_eproto_unexpected_reply); + return true; } static void rxrpc_input_update_ack_window(struct rxrpc_call *call, @@ -365,17 +366,14 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, if (last) { if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq + 1 != wtop) { - rxrpc_proto_abort("LSN", call, seq); - return; - } + seq + 1 != wtop) + return rxrpc_proto_abort(call, seq, rxrpc_eproto_different_last); } else { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && after_eq(seq, wtop)) { pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n", call->debug_id, seq, window, wtop, wlimit); - rxrpc_proto_abort("LSA", call, seq); - return; + return rxrpc_proto_abort(call, seq, rxrpc_eproto_data_after_last); } } @@ -583,7 +581,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) goto out_notify; if (!rxrpc_input_split_jumbo(call, skb)) { - rxrpc_proto_abort("VLD", call, sp->hdr.seq); + rxrpc_proto_abort(call, sp->hdr.seq, rxrpc_badmsg_bad_jumbo); goto out_notify; } skb = NULL; @@ -764,7 +762,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) offset = sizeof(struct rxrpc_wire_header); if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) - return rxrpc_proto_abort("XAK", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack); offset += sizeof(ack); ack_serial = sp->hdr.serial; @@ -844,7 +842,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) ioffset = offset + nr_acks + 3; if (skb->len >= ioffset + sizeof(info) && skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) - return rxrpc_proto_abort("XAI", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_info); if (nr_acks > 0) skb_condense(skb); @@ -867,7 +865,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_input_ackinfo(call, skb, &info); if (first_soft_ack == 0) - return rxrpc_proto_abort("AK0", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ switch (READ_ONCE(call->state)) { @@ -882,20 +880,20 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (before(hard_ack, call->acks_hard_ack) || after(hard_ack, call->tx_top)) - return rxrpc_proto_abort("AKW", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) - return rxrpc_proto_abort("AKN", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { - rxrpc_end_tx_phase(call, false, "ETA"); + rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); return; } } if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) - return rxrpc_proto_abort("XSA", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, nr_acks, &summary); } @@ -917,7 +915,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ack_summary summary = { 0 }; if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) - rxrpc_end_tx_phase(call, false, "ETL"); + rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ackall); } /* @@ -962,27 +960,23 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: - rxrpc_input_data(call, skb); - break; + return rxrpc_input_data(call, skb); case RXRPC_PACKET_TYPE_ACK: - rxrpc_input_ack(call, skb); - break; + return rxrpc_input_ack(call, skb); case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets from the server; the retry and * lifespan timers will take care of business. BUSY packets * from the client don't make sense. */ - break; + return; case RXRPC_PACKET_TYPE_ABORT: - rxrpc_input_abort(call, skb); - break; + return rxrpc_input_abort(call, skb); case RXRPC_PACKET_TYPE_ACKALL: - rxrpc_input_ackall(call, skb); - break; + return rxrpc_input_ackall(call, skb); default: break; @@ -1004,7 +998,8 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_COMPLETE: break; default: - rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ESHUTDOWN, + rxrpc_eproto_improper_term); trace_rxrpc_improper_term(call); break; } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 29dcc7d3f51a..34353b6e584b 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -45,13 +45,15 @@ static void none_free_call_crypto(struct rxrpc_call *call) static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, "RXN"); + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_challenge); } static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, "RXN"); + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_response); } static void none_clear(struct rxrpc_connection *conn) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 46e58cf5bc96..33fd2394c8b3 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -66,10 +66,32 @@ void rxrpc_error_report(struct sock *sk) rcu_read_unlock(); } +/* + * Directly produce an abort from a packet. + */ +bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + abort_code, err); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = abort_code; + return false; +} + +static bool rxrpc_bad_message(struct sk_buff *skb, enum rxrpc_abort_reason why) +{ + return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EBADMSG); +} + +#define just_discard true + /* * Process event packets targeted at a local endpoint. */ -static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) +static bool rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); char v; @@ -81,22 +103,21 @@ static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) if (v == 0) rxrpc_send_version_request(local, &sp->hdr, skb); } + + return true; } /* * Extract the wire header from a packet and translate the byte order. */ -static noinline -int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, + struct sk_buff *skb) { struct rxrpc_wire_header whdr; /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) { - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_hdr")); - return -EBADMSG; - } + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_short_hdr); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -110,7 +131,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); - return 0; + return true; } /* @@ -130,28 +151,28 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) /* * Process packets received on the local endpoint */ -static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) +static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; struct sk_buff *skb = *_skb; - int ret = 0; + bool ret = false; skb_pull(skb, sizeof(struct udphdr)); sp = rxrpc_skb(skb); /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; + if (!rxrpc_extract_header(sp, skb)) + return just_discard; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - return 0; + return just_discard; } } @@ -160,28 +181,28 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) - return 0; - rxrpc_input_version(local, skb); - return 0; + return just_discard; + return rxrpc_input_version(local, skb); case RXRPC_PACKET_TYPE_BUSY: if (rxrpc_to_server(sp)) - return 0; + return just_discard; fallthrough; case RXRPC_PACKET_TYPE_ACK: case RXRPC_PACKET_TYPE_ACKALL: if (sp->hdr.callNumber == 0) - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); break; case RXRPC_PACKET_TYPE_ABORT: if (!rxrpc_extract_abort(skb)) - return 0; /* Just discard if malformed */ + return just_discard; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0 || - sp->hdr.seq == 0) - goto bad_message; + if (sp->hdr.callNumber == 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); + if (sp->hdr.seq == 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq); /* Unshare the packet so that it can be modified for in-place * decryption. @@ -191,7 +212,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) if (!skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); *_skb = NULL; - return 0; + return just_discard; } if (skb != *_skb) { @@ -205,28 +226,28 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) case RXRPC_PACKET_TYPE_CHALLENGE: if (rxrpc_to_server(sp)) - return 0; + return just_discard; break; case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_to_client(sp)) - return 0; + return just_discard; break; /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: - return 0; + return just_discard; default: - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_unsupported_packet); } if (sp->hdr.serviceId == 0) - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_service); if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) - return true; /* Unsupported address type - discard. */ + return just_discard; /* Unsupported address type. */ if (peer_srx.transport.family != local->srx.transport.family && (peer_srx.transport.family == AF_INET && @@ -234,7 +255,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", peer_srx.transport.family, local->srx.transport.family); - return true; /* Wrong address type - discard. */ + return just_discard; /* Wrong address type. */ } if (rxrpc_to_client(sp)) { @@ -242,12 +263,8 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); rcu_read_unlock(); - if (!conn) { - trace_rxrpc_abort(0, "NCC", sp->hdr.cid, - sp->hdr.callNumber, sp->hdr.seq, - RXKADINCONSISTENCY, EBADMSG); - goto protocol_error; - } + if (!conn) + return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_conn); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); @@ -280,19 +297,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb); rxrpc_put_peer(peer, rxrpc_peer_put_input); - if (ret < 0) - goto reject_packet; - return 0; - -bad_message: - trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); -protocol_error: - skb->priority = RX_PROTOCOL_ERROR; - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; -reject_packet: - rxrpc_reject_packet(local, skb); - return 0; + return ret; } /* @@ -306,21 +311,23 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; + bool ret; if (sp->hdr.securityIndex != conn->security_ix) - goto wrong_security; + return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, + RXKADINCONSISTENCY, -EBADMSG); if (sp->hdr.serviceId != conn->service_id) { int old_id; if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) - goto reupgrade; + return rxrpc_protocol_error(skb, rxrpc_eproto_reupgrade); + old_id = cmpxchg(&conn->service_id, conn->orig_service_id, sp->hdr.serviceId); - if (old_id != conn->orig_service_id && old_id != sp->hdr.serviceId) - goto reupgrade; + return rxrpc_protocol_error(skb, rxrpc_eproto_bad_upgrade); } if (after(sp->hdr.serial, conn->hi_serial)) @@ -336,19 +343,19 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) - return 0; + return just_discard; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) - return 0; + return just_discard; /* For the previous service call, if completed successfully, we * discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) - return 0; + return just_discard; /* But otherwise we need to retransmit the final packet from * data cached in the connection record. @@ -359,7 +366,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, sp->hdr.serial, sp->hdr.flags); rxrpc_conn_retransmit_call(conn, skb, channel); - return 0; + return just_discard; } rcu_read_lock(); @@ -370,7 +377,8 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { rxrpc_put_call(call, rxrpc_call_put_input); - goto reject_packet; + return rxrpc_protocol_error(skb, + rxrpc_eproto_unexpected_implicit_end); } if (call) { @@ -382,38 +390,14 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (!call) { if (rxrpc_to_client(sp)) - goto bad_message; - if (rxrpc_new_incoming_call(conn->local, conn->peer, conn, - peer_srx, skb) == 0) - return 0; - goto reject_packet; + return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_call); + return rxrpc_new_incoming_call(conn->local, conn->peer, conn, + peer_srx, skb); } - rxrpc_input_call_event(call, skb); + ret = rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return 0; - -wrong_security: - trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RXKADINCONSISTENCY, EBADMSG); - skb->priority = RXKADINCONSISTENCY; - goto post_abort; - -reupgrade: - trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); - goto protocol_error; - -bad_message: - trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); -protocol_error: - skb->priority = RX_PROTOCOL_ERROR; -post_abort: - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; -reject_packet: - rxrpc_reject_packet(conn->local, skb); - return 0; + return ret; } /* @@ -470,7 +454,8 @@ int rxrpc_io_thread(void *data) switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; - rxrpc_input_packet(local, &skb); + if (!rxrpc_input_packet(local, &skb)) + rxrpc_reject_packet(local, skb); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8d5fe65f5951..59b521b82aec 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -117,8 +117,8 @@ bool rxrpc_call_completed(struct rxrpc_call *call) /* * Record that a call is locally aborted. */ -bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) +bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) { trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, abort_code, error); @@ -126,13 +126,13 @@ bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, abort_code, error); } -bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) { bool ret; write_lock(&call->state_lock); - ret = __rxrpc_abort_call(why, call, seq, abort_code, error); + ret = __rxrpc_abort_call(call, seq, abort_code, error, why); write_unlock(&call->state_lock); if (ret && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) rxrpc_send_abort_packet(call); @@ -642,11 +642,15 @@ out: return ret; short_data: - trace_rxrpc_rx_eproto(call, 0, tracepoint_string("short_data")); + trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_short_data, + call->cid, call->call_id, call->rx_consumed, + 0, -EBADMSG); ret = -EBADMSG; goto out; excess_data: - trace_rxrpc_rx_eproto(call, 0, tracepoint_string("excess_data")); + trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_excess_data, + call->cid, call->call_id, call->rx_consumed, + 0, -EMSGSIZE); ret = -EMSGSIZE; goto out; call_complete: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 5d2fbc6ec3cf..e52cb8058156 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -411,18 +411,15 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist sg[16]; - bool aborted; u32 data_size, buf; u16 check; int ret; _enter(""); - if (sp->len < 8) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_hdr", "V1H", - RXKADSEALEDINCON); - goto protocol_error; - } + if (sp->len < 8) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_1_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. @@ -442,11 +439,9 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_len", "XV1", - RXKADDATALEN); - goto protocol_error; - } + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_1_short_encdata); sp->offset += sizeof(sechdr); sp->len -= sizeof(sechdr); @@ -456,26 +451,16 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, check = buf >> 16; check ^= seq ^ call->call_id; check &= 0xffff; - if (check != 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_check", "V1C", - RXKADSEALEDINCON); - goto protocol_error; - } - - if (data_size > sp->len) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_datalen", "V1L", - RXKADDATALEN); - goto protocol_error; - } + if (check != 0) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_1_short_check); + if (data_size > sp->len) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_1_short_data); sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; } /* @@ -490,18 +475,15 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; - bool aborted; u32 data_size, buf; u16 check; int nsg, ret; _enter(",{%d}", sp->len); - if (sp->len < 8) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_hdr", "V2H", - RXKADSEALEDINCON); - goto protocol_error; - } + if (sp->len < 8) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_2_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. @@ -513,7 +495,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } else { sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO); if (!sg) - goto nomem; + return -ENOMEM; } sg_init_table(sg, nsg); @@ -537,11 +519,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_len", "XV2", - RXKADDATALEN); - goto protocol_error; - } + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_2_short_len); sp->offset += sizeof(sechdr); sp->len -= sizeof(sechdr); @@ -551,30 +531,17 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, check = buf >> 16; check ^= seq ^ call->call_id; check &= 0xffff; - if (check != 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_check", "V2C", - RXKADSEALEDINCON); - goto protocol_error; - } + if (check != 0) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_2_short_check); - if (data_size > sp->len) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_datalen", "V2L", - RXKADDATALEN); - goto protocol_error; - } + if (data_size > sp->len) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_2_short_data); sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; - -nomem: - _leave(" = -ENOMEM"); - return -ENOMEM; } /* @@ -590,7 +557,6 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) __be32 buf[2]; } crypto __aligned(8); rxrpc_seq_t seq = sp->hdr.seq; - bool aborted; int ret; u16 cksum; u32 x, y; @@ -627,9 +593,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) cksum = 1; /* zero checksums are not permitted */ if (cksum != sp->hdr.cksum) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_csum", "VCK", - RXKADSEALEDINCON); - goto protocol_error; + ret = rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_bad_checksum); + goto out; } switch (call->conn->security_level) { @@ -647,13 +613,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) break; } +out: skcipher_request_free(req); return ret; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; } /* @@ -827,27 +789,24 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_challenge challenge; struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - const char *eproto; - u32 version, nonce, min_level, abort_code; - int ret; + u32 version, nonce, min_level; + int ret = -EPROTO; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - eproto = tracepoint_string("chall_no_key"); - abort_code = RX_PROTOCOL_ERROR; if (!conn->key) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); - abort_code = RXKADEXPIRED; ret = key_validate(conn->key); if (ret < 0) - goto other_error; + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); - eproto = tracepoint_string("chall_short"); - abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &challenge, sizeof(challenge)) < 0) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); version = ntohl(challenge.version); nonce = ntohl(challenge.nonce); @@ -855,15 +814,13 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); - eproto = tracepoint_string("chall_ver"); - abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); - abort_code = RXKADLEVELFAIL; - ret = -EACCES; if (conn->security_level < min_level) - goto other_error; + return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); token = conn->key->payload.data[0]; @@ -892,13 +849,6 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); kfree(resp); return ret; - -protocol_error: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); - ret = -EPROTO; -other_error: - rxrpc_abort_conn(conn, skb, abort_code, ret, "RXK"); - return ret; } /* @@ -912,16 +862,12 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, time64_t *_expiry) { struct skcipher_request *req; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv, key; struct scatterlist sg[1]; struct in_addr addr; unsigned int life; - const char *eproto; time64_t issue, now; bool little_endian; - int ret; - u32 abort_code; u8 *p, *q, *name, *end; _enter("{%d},{%x}", conn->debug_id, key_serial(server_key)); @@ -933,10 +879,9 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); - ret = -ENOMEM; req = skcipher_request_alloc(server_key->payload.data[0], GFP_NOFS); if (!req) - goto temporary_error; + return -ENOMEM; sg_init_one(&sg[0], ticket, ticket_len); skcipher_request_set_callback(req, 0, NULL, NULL); @@ -947,18 +892,21 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p = ticket; end = p + ticket_len; -#define Z(field) \ - ({ \ - u8 *__str = p; \ - eproto = tracepoint_string("rxkad_bad_"#field); \ - q = memchr(p, 0, end - p); \ - if (!q || q - p > (field##_SZ)) \ - goto bad_ticket; \ - for (; p < q; p++) \ - if (!isprint(*p)) \ - goto bad_ticket; \ - p++; \ - __str; \ +#define Z(field, fieldl) \ + ({ \ + u8 *__str = p; \ + q = memchr(p, 0, end - p); \ + if (!q || q - p > field##_SZ) \ + return rxrpc_abort_conn( \ + conn, skb, RXKADBADTICKET, -EPROTO, \ + rxkad_abort_resp_tkt_##fieldl); \ + for (; p < q; p++) \ + if (!isprint(*p)) \ + return rxrpc_abort_conn( \ + conn, skb, RXKADBADTICKET, -EPROTO, \ + rxkad_abort_resp_tkt_##fieldl); \ + p++; \ + __str; \ }) /* extract the ticket flags */ @@ -967,20 +915,20 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p++; /* extract the authentication name */ - name = Z(ANAME); + name = Z(ANAME, aname); _debug("KIV ANAME: %s", name); /* extract the principal's instance */ - name = Z(INST); + name = Z(INST, inst); _debug("KIV INST : %s", name); /* extract the principal's authentication domain */ - name = Z(REALM); + name = Z(REALM, realm); _debug("KIV REALM: %s", name); - eproto = tracepoint_string("rxkad_bad_len"); if (end - p < 4 + 8 + 4 + 2) - goto bad_ticket; + return rxrpc_abort_conn(conn, skb, RXKADBADTICKET, -EPROTO, + rxkad_abort_resp_tkt_short); /* get the IPv4 address of the entity that requested the ticket */ memcpy(&addr, p, sizeof(addr)); @@ -1012,37 +960,23 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, _debug("KIV ISSUE: %llx [%llx]", issue, now); /* check the ticket is in date */ - if (issue > now) { - abort_code = RXKADNOAUTH; - ret = -EKEYREJECTED; - goto other_error; - } - - if (issue < now - life) { - abort_code = RXKADEXPIRED; - ret = -EKEYEXPIRED; - goto other_error; - } + if (issue > now) + return rxrpc_abort_conn(conn, skb, RXKADNOAUTH, -EKEYREJECTED, + rxkad_abort_resp_tkt_future); + if (issue < now - life) + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, -EKEYEXPIRED, + rxkad_abort_resp_tkt_expired); *_expiry = issue + life; /* get the service name */ - name = Z(SNAME); + name = Z(SNAME, sname); _debug("KIV SNAME: %s", name); /* get the service instance name */ - name = Z(INST); + name = Z(INST, sinst); _debug("KIV SINST: %s", name); return 0; - -bad_ticket: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); - abort_code = RXKADBADTICKET; - ret = -EPROTO; -other_error: - return rxrpc_abort_conn(conn, skb, abort_code, ret, "RXK"); -temporary_error: - return ret; } /* @@ -1089,10 +1023,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; struct key *server_key; - const char *eproto; time64_t expiry; void *ticket; - u32 abort_code, version, kvno, ticket_len, level; + u32 version, kvno, ticket_len, level; __be32 csum; int ret, i; @@ -1100,19 +1033,18 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, server_key = rxrpc_look_up_server_security(conn, skb, 0, 0); if (IS_ERR(server_key)) { - switch (PTR_ERR(server_key)) { + ret = PTR_ERR(server_key); + switch (ret) { case -ENOKEY: - abort_code = RXKADUNKNOWNKEY; - break; + return rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, ret, + rxkad_abort_resp_nokey); case -EKEYEXPIRED: - abort_code = RXKADEXPIRED; - break; + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_resp_key_expired); default: - abort_code = RXKADNOAUTH; - break; + return rxrpc_abort_conn(conn, skb, RXKADNOAUTH, ret, + rxkad_abort_resp_key_rejected); } - return rxrpc_abort_conn(conn, skb, abort_code, - PTR_ERR(server_key), "RXK"); } ret = -ENOMEM; @@ -1120,11 +1052,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (!response) goto temporary_error; - eproto = tracepoint_string("rxkad_rsp_short"); - abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - response, sizeof(*response)) < 0) + response, sizeof(*response)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short); goto protocol_error; + } version = ntohl(response->version); ticket_len = ntohl(response->ticket_len); @@ -1132,20 +1065,23 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); - eproto = tracepoint_string("rxkad_rsp_ver"); - abort_code = RXKADINCONSISTENCY; - if (version != RXKAD_VERSION) + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_resp_version); goto protocol_error; + } - eproto = tracepoint_string("rxkad_rsp_tktlen"); - abort_code = RXKADTICKETLEN; - if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) + if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) { + rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO, + rxkad_abort_resp_tkt_len); goto protocol_error; + } - eproto = tracepoint_string("rxkad_rsp_unkkey"); - abort_code = RXKADUNKNOWNKEY; - if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) + if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) { + rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO, + rxkad_abort_resp_unknown_tkt); goto protocol_error; + } /* extract the kerberos ticket and decrypt and decode it */ ret = -ENOMEM; @@ -1153,12 +1089,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (!ticket) goto temporary_error_free_resp; - eproto = tracepoint_string("rxkad_tkt_short"); - abort_code = RXKADPACKETSHORT; - ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), - ticket, ticket_len); - if (ret < 0) - goto temporary_error_free_ticket; + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), + ticket, ticket_len) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short_tkt); + goto protocol_error; + } ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, &session_key, &expiry); @@ -1169,56 +1105,66 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, * response */ rxkad_decrypt_response(conn, response, &session_key); - eproto = tracepoint_string("rxkad_rsp_param"); - abort_code = RXKADSEALEDINCON; - if (ntohl(response->encrypted.epoch) != conn->proto.epoch) - goto protocol_error_free; - if (ntohl(response->encrypted.cid) != conn->proto.cid) - goto protocol_error_free; - if (ntohl(response->encrypted.securityIndex) != conn->security_ix) + if (ntohl(response->encrypted.epoch) != conn->proto.epoch || + ntohl(response->encrypted.cid) != conn->proto.cid || + ntohl(response->encrypted.securityIndex) != conn->security_ix) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_param); goto protocol_error_free; + } + csum = response->encrypted.checksum; response->encrypted.checksum = 0; rxkad_calc_response_checksum(response); - eproto = tracepoint_string("rxkad_rsp_csum"); - if (response->encrypted.checksum != csum) + if (response->encrypted.checksum != csum) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_checksum); goto protocol_error_free; + } spin_lock(&conn->bundle->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { struct rxrpc_call *call; u32 call_id = ntohl(response->encrypted.call_id[i]); - eproto = tracepoint_string("rxkad_rsp_callid"); - if (call_id > INT_MAX) + if (call_id > INT_MAX) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_callid); goto protocol_error_unlock; + } - eproto = tracepoint_string("rxkad_rsp_callctr"); - if (call_id < conn->channels[i].call_counter) + if (call_id < conn->channels[i].call_counter) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_call_ctr); goto protocol_error_unlock; + } - eproto = tracepoint_string("rxkad_rsp_callst"); if (call_id > conn->channels[i].call_counter) { call = rcu_dereference_protected( conn->channels[i].call, lockdep_is_held(&conn->bundle->channel_lock)); - if (call && call->state < RXRPC_CALL_COMPLETE) + if (call && call->state < RXRPC_CALL_COMPLETE) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_call_state); goto protocol_error_unlock; + } conn->channels[i].call_counter = call_id; } } spin_unlock(&conn->bundle->channel_lock); - eproto = tracepoint_string("rxkad_rsp_seq"); - abort_code = RXKADOUTOFSEQUENCE; - if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) + if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) { + rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO, + rxkad_abort_resp_ooseq); goto protocol_error_free; + } - eproto = tracepoint_string("rxkad_rsp_level"); - abort_code = RXKADLEVELFAIL; level = ntohl(response->encrypted.level); - if (level > RXRPC_SECURITY_ENCRYPT) + if (level > RXRPC_SECURITY_ENCRYPT) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO, + rxkad_abort_resp_level); goto protocol_error_free; + } conn->security_level = level; /* create a key to hold the security data and expiration time - after @@ -1240,8 +1186,7 @@ protocol_error_free: protocol_error: kfree(response); key_put(server_key); - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); - return rxrpc_abort_conn(conn, skb, abort_code, -EPROTO, "RXK"); + return -EPROTO; temporary_error_free_ticket: kfree(ticket); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index d33a109e846c..16dcabb71ebe 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -10,6 +10,8 @@ #include #include #include +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include MODULE_DESCRIPTION("rxperf test server (afs)"); MODULE_AUTHOR("Red Hat, Inc."); @@ -307,12 +309,14 @@ static void rxperf_deliver_to_call(struct work_struct *work) case -EOPNOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - abort_code, ret, "GOP"); + abort_code, ret, + rxperf_abort_op_not_supported); goto call_complete; case -ENOTSUPP: abort_code = RX_USER_ABORT; rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - abort_code, ret, "GUA"); + abort_code, ret, + rxperf_abort_op_not_supported); goto call_complete; case -EIO: pr_err("Call %u in bad state %u\n", @@ -324,11 +328,13 @@ static void rxperf_deliver_to_call(struct work_struct *work) case -ENOMEM: case -EFAULT: rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RXGEN_SS_UNMARSHAL, ret, "GUM"); + RXGEN_SS_UNMARSHAL, ret, + rxperf_abort_unmarshal_error); goto call_complete; default: rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RX_CALL_DEAD, ret, "GER"); + RX_CALL_DEAD, ret, + rxperf_abort_general_error); goto call_complete; } } @@ -523,7 +529,8 @@ static int rxperf_process_call(struct rxperf_call *call) if (n == -ENOMEM) rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RXGEN_SS_MARSHAL, -ENOMEM, "GOM"); + RXGEN_SS_MARSHAL, -ENOMEM, + rxperf_abort_oom); return n; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index ab968f65a490..78af14694618 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -144,21 +144,15 @@ const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - trace_rxrpc_abort(0, "SVS", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; + rxrpc_direct_abort(skb, rxrpc_abort_unsupported_security, + RX_INVALID_OPERATION, -EKEYREJECTED); return NULL; } if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && !rx->securities) { - trace_rxrpc_abort(0, "SVR", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = sec->no_key_abort; + rxrpc_direct_abort(skb, rxrpc_abort_no_service_key, + sec->no_key_abort, -EKEYREJECTED); return NULL; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index dc3c2a834fc8..d67808b659f1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -20,14 +20,15 @@ /* * Propose an abort to be made in the I/O thread. */ -bool rxrpc_propose_abort(struct rxrpc_call *call, - u32 abort_code, int error, const char *why) +bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, + enum rxrpc_abort_reason why) { - _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); + _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); if (!call->send_abort && call->state < RXRPC_CALL_COMPLETE) { call->send_abort_why = why; call->send_abort_err = error; + call->send_abort_seq = 0; /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); @@ -683,7 +684,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { - rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, "CMD"); + rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, + rxrpc_abort_call_sendmsg); ret = 0; } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; @@ -748,7 +750,9 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, break; default: /* Request phase complete for this client call */ - trace_rxrpc_rx_eproto(call, 0, tracepoint_string("late_send")); + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); ret = -EPROTO; break; } @@ -766,17 +770,17 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value - * @why: 3-char string indicating why. + * @why: Indication as to why. * * Allow a kernel service to abort a call, if it's still in an abortable state * and return true if the call was aborted, false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, - u32 abort_code, int error, const char *why) + u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; - _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); + _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); aborted = rxrpc_propose_abort(call, abort_code, error, why); -- cgit v1.2.3 From f06cb29189361353e9ed12df936c8e1d7f69b730 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 22:58:56 +0100 Subject: rxrpc: Make the set of connection IDs per local endpoint Make the set of connection IDs per local endpoint so that endpoints don't cause each other's connections to get dismissed. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 8 -------- net/rxrpc/ar-internal.h | 5 +++-- net/rxrpc/conn_client.c | 44 +++++++++++++++++++------------------------- net/rxrpc/conn_object.c | 6 +++--- net/rxrpc/local_object.c | 10 ++++++++++ 5 files changed, 35 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7ea576f6ba4b..6f6a6b77ee84 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -957,16 +957,9 @@ static const struct net_proto_family rxrpc_family_ops = { static int __init af_rxrpc_init(void) { int ret = -1; - unsigned int tmp; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); - get_random_bytes(&tmp, sizeof(tmp)); - tmp &= 0x3fffffff; - if (tmp == 0) - tmp = 1; - idr_set_cursor(&rxrpc_client_conn_ids, tmp); - ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, @@ -1062,7 +1055,6 @@ static void __exit af_rxrpc_exit(void) * are released. */ rcu_barrier(); - rxrpc_destroy_client_conn_ids(); destroy_workqueue(rxrpc_workqueue); rxrpc_exit_security(); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 120ce3ccbb22..e9ab06100a21 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -300,6 +300,8 @@ struct rxrpc_local { int debug_id; /* debug ID for printks */ bool dead; bool service_closed; /* Service socket closed */ + struct idr conn_ids; /* List of connection IDs */ + spinlock_t conn_lock; /* Lock for client connection pool */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -887,9 +889,8 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; -extern struct idr rxrpc_client_conn_ids; -void rxrpc_destroy_client_conn_ids(void); +void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 1edd65883c55..59ce5c08cf57 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -34,12 +34,6 @@ __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; -/* - * We use machine-unique IDs for our client connections. - */ -DEFINE_IDR(rxrpc_client_conn_ids); -static DEFINE_SPINLOCK(rxrpc_conn_id_lock); - static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); /* @@ -51,65 +45,65 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { - struct rxrpc_net *rxnet = conn->rxnet; + struct rxrpc_local *local = conn->local; int id; _enter(""); idr_preload(gfp); - spin_lock(&rxrpc_conn_id_lock); + spin_lock(&local->conn_lock); - id = idr_alloc_cyclic(&rxrpc_client_conn_ids, conn, + id = idr_alloc_cyclic(&local->conn_ids, conn, 1, 0x40000000, GFP_NOWAIT); if (id < 0) goto error; - spin_unlock(&rxrpc_conn_id_lock); + spin_unlock(&local->conn_lock); idr_preload_end(); - conn->proto.epoch = rxnet->epoch; + conn->proto.epoch = local->rxnet->epoch; conn->proto.cid = id << RXRPC_CIDSHIFT; set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); _leave(" [CID %x]", conn->proto.cid); return 0; error: - spin_unlock(&rxrpc_conn_id_lock); + spin_unlock(&local->conn_lock); idr_preload_end(); _leave(" = %d", id); return id; } /* - * Release a connection ID for a client connection from the global pool. + * Release a connection ID for a client connection. */ -static void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) +static void rxrpc_put_client_connection_id(struct rxrpc_local *local, + struct rxrpc_connection *conn) { if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { - spin_lock(&rxrpc_conn_id_lock); - idr_remove(&rxrpc_client_conn_ids, - conn->proto.cid >> RXRPC_CIDSHIFT); - spin_unlock(&rxrpc_conn_id_lock); + spin_lock(&local->conn_lock); + idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); + spin_unlock(&local->conn_lock); } } /* * Destroy the client connection ID tree. */ -void rxrpc_destroy_client_conn_ids(void) +void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) { struct rxrpc_connection *conn; int id; - if (!idr_is_empty(&rxrpc_client_conn_ids)) { - idr_for_each_entry(&rxrpc_client_conn_ids, conn, id) { + if (!idr_is_empty(&local->conn_ids)) { + idr_for_each_entry(&local->conn_ids, conn, id) { pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", conn, refcount_read(&conn->ref)); } BUG(); } - idr_destroy(&rxrpc_client_conn_ids); + idr_destroy(&local->conn_ids); } /* @@ -225,7 +219,7 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) return conn; error_1: - rxrpc_put_client_connection_id(conn); + rxrpc_put_client_connection_id(bundle->local, conn); error_0: kfree(conn); _leave(" = %d", ret); @@ -257,7 +251,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ - id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); + id_cursor = idr_get_cursor(&conn->local->conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) @@ -982,7 +976,7 @@ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); atomic_dec(&rxnet->nr_client_conns); - rxrpc_put_client_connection_id(conn); + rxrpc_put_client_connection_id(local, conn); } /* diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 281f59e356f5..2e3f0a222e1b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -100,10 +100,10 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - /* Look up client connections by connection ID alone as their IDs are - * unique for this machine. + /* Look up client connections by connection ID alone as their + * IDs are unique for this machine. */ - conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); + conn = idr_find(&local->conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); if (!conn || refcount_read(&conn->ref) == 0) { _debug("no conn"); goto not_found; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 8ef6cd8defa4..ca8b3ee68b59 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -89,6 +89,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; + u32 tmp; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { @@ -109,6 +110,14 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; + idr_init(&local->conn_ids); + get_random_bytes(&tmp, sizeof(tmp)); + tmp &= 0x3fffffff; + if (tmp == 0) + tmp = 1; + idr_set_cursor(&local->conn_ids, tmp); + spin_lock_init(&local->conn_lock); + trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } @@ -409,6 +418,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local) * local endpoint. */ rxrpc_purge_queue(&local->rx_queue); + rxrpc_destroy_client_conn_ids(local); } /* -- cgit v1.2.3 From 2953d3b8d8fd1188034c54862b74402b0b846695 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 08:54:03 +0100 Subject: rxrpc: Offload the completion of service conn security to the I/O thread Offload the completion of the challenge/response cycle on a service connection to the I/O thread. After the RESPONSE packet has been successfully decrypted and verified by the work queue, offloading the changing of the call states to the I/O thread makes iteration over the conn's channel list simpler. Do this by marking the RESPONSE skbuff and putting it onto the receive queue for the I/O thread to collect. We put it on the front of the queue as we've already received the packet for it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_event.c | 46 ++++++++++++++++++++++++++++++++-------------- net/rxrpc/io_thread.c | 5 +++++ 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e9ab06100a21..e508ec221b75 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -38,6 +38,7 @@ struct rxrpc_txbuf; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 485d7f0fed2c..b2042702ca9a 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -248,7 +248,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int loop, ret; + int ret; if (conn->state == RXRPC_CONN_ABORTED) return -ECONNABORTED; @@ -269,22 +269,21 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->bundle->channel_lock); spin_lock(&conn->state_lock); - - if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { + if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure( - rcu_dereference_protected( - conn->channels[loop].call, - lockdep_is_held(&conn->bundle->channel_lock))); - } else { - spin_unlock(&conn->state_lock); - } + spin_unlock(&conn->state_lock); - spin_unlock(&conn->bundle->channel_lock); + if (conn->state == RXRPC_CONN_SERVICE) { + /* Offload call state flipping to the I/O thread. As + * we've already received the packet, put it on the + * front of the queue. + */ + skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; + rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); + skb_queue_head(&conn->local->rx_queue, skb); + rxrpc_wake_up_io_thread(conn->local); + } return 0; default: @@ -442,9 +441,28 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) */ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) { + unsigned int loop; + if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; + + spin_lock(&conn->bundle->channel_lock); + + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure( + rcu_dereference_protected( + conn->channels[loop].call, + lockdep_is_held(&conn->bundle->channel_lock))); + + spin_unlock(&conn->bundle->channel_lock); + break; + } + /* Process delayed ACKs whose time has come. */ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 33fd2394c8b3..751139b3c1ac 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -451,6 +451,7 @@ int rxrpc_io_thread(void *data) /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; @@ -463,6 +464,10 @@ int rxrpc_io_thread(void *data) rxrpc_input_error(local, skb); rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + rxrpc_input_conn_event(sp->conn, skb); + rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From 1bab27af6b88b5c811f99de4812b5590f20d1cb7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 09:30:23 +0100 Subject: rxrpc: Set up a connection bundle from a call, not rxrpc_conn_parameters Use the information now stored in struct rxrpc_call to configure the connection bundle and thence the connection, rather than using the rxrpc_conn_parameters struct. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 1 - net/rxrpc/ar-internal.h | 8 +-- net/rxrpc/call_object.c | 4 +- net/rxrpc/conn_client.c | 132 ++++++++++++++++++++++++------------------------ net/rxrpc/conn_object.c | 2 +- net/rxrpc/sendmsg.c | 1 - 6 files changed, 75 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 6f6a6b77ee84..f4e1ffff2ba4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -328,7 +328,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, mutex_unlock(&call->user_mutex); } - rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p", call); return call; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e508ec221b75..2740c6333114 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -360,7 +360,6 @@ struct rxrpc_conn_proto { struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ - struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ @@ -428,6 +427,7 @@ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ + const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; @@ -593,6 +593,7 @@ enum rxrpc_congest_mode { struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ + struct rxrpc_bundle *bundle; /* Connection bundle to use */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_sock __rcu *socket; /* socket responsible */ @@ -894,11 +895,10 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry; void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); -int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, - struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - gfp_t); +int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); +void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_discard_expired_client_conns(struct work_struct *); void rxrpc_destroy_all_client_connections(struct rxrpc_net *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 705f6e26cc75..835e9781afc6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -365,7 +365,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ - ret = rxrpc_connect_call(rx, call, cp, srx, gfp); + ret = rxrpc_connect_call(call, gfp); if (ret < 0) goto error_attached_to_socket; @@ -663,6 +663,8 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); + rxrpc_deactivate_bundle(call->bundle); + rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 59ce5c08cf57..c0db7722571e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -34,7 +34,10 @@ __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; -static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); +static void rxrpc_activate_bundle(struct rxrpc_bundle *bundle) +{ + atomic_inc(&bundle->active); +} /* * Get a connection ID and epoch for a client connection from the global pool. @@ -109,20 +112,21 @@ void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) /* * Allocate a connection bundle. */ -static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, +static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { - bundle->local = cp->local; - bundle->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_bundle); - bundle->key = cp->key; - bundle->exclusive = cp->exclusive; - bundle->upgrade = cp->upgrade; - bundle->service_id = cp->service_id; - bundle->security_level = cp->security_level; + bundle->local = call->local; + bundle->peer = rxrpc_get_peer(call->peer, rxrpc_peer_get_bundle); + bundle->key = key_get(call->key); + bundle->security = call->security; + bundle->exclusive = test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); + bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); + bundle->service_id = call->dest_srx.srx_service; + bundle->security_level = call->security_level; refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); @@ -146,19 +150,23 @@ static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); + key_put(bundle->key); kfree(bundle); } void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { - unsigned int id = bundle->debug_id; + unsigned int id; bool dead; int r; - dead = __refcount_dec_and_test(&bundle->ref, &r); - trace_rxrpc_bundle(id, r - 1, why); - if (dead) - rxrpc_free_bundle(bundle); + if (bundle) { + id = bundle->debug_id; + dead = __refcount_dec_and_test(&bundle->ref, &r); + trace_rxrpc_bundle(id, r - 1, why); + if (dead) + rxrpc_free_bundle(bundle); + } } /* @@ -272,20 +280,23 @@ dont_reuse: * Look up the conn bundle that matches the connection parameters, adding it if * it doesn't yet exist. */ -static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *cp, - gfp_t gfp) +static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle, *candidate; - struct rxrpc_local *local = cp->local; + struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; long diff; + bool upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); _enter("{%px,%x,%u,%u}", - cp->peer, key_serial(cp->key), cp->security_level, cp->upgrade); + call->peer, key_serial(call->key), call->security_level, + upgrade); - if (cp->exclusive) - return rxrpc_alloc_bundle(cp, gfp); + if (test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags)) { + call->bundle = rxrpc_alloc_bundle(call, gfp); + return call->bundle; + } /* First, see if the bundle is already there. */ _debug("search 1"); @@ -294,11 +305,11 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c while (p) { bundle = rb_entry(p, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level) ?: - cmp(upgrade)); +#define cmp(X, Y) ((long)(X) - (long)(Y)) + diff = (cmp(bundle->peer, call->peer) ?: + cmp(bundle->key, call->key) ?: + cmp(bundle->security_level, call->security_level) ?: + cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) p = p->rb_left; @@ -311,9 +322,9 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c _debug("not found"); /* It wasn't. We need to add one. */ - candidate = rxrpc_alloc_bundle(cp, gfp); + candidate = rxrpc_alloc_bundle(call, gfp); if (!candidate) - return NULL; + return ERR_PTR(-ENOMEM); _debug("search 2"); spin_lock(&local->client_bundles_lock); @@ -323,11 +334,11 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c parent = *pp; bundle = rb_entry(parent, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level) ?: - cmp(upgrade)); +#define cmp(X, Y) ((long)(X) - (long)(Y)) + diff = (cmp(bundle->peer, call->peer) ?: + cmp(bundle->key, call->key) ?: + cmp(bundle->security_level, call->security_level) ?: + cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) pp = &(*pp)->rb_left; @@ -341,19 +352,19 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); - rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); + call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); - _leave(" = %u [new]", candidate->debug_id); - return candidate; + _leave(" = B=%u [new]", call->bundle->debug_id); + return call->bundle; found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: - rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); - atomic_inc(&bundle->active); + call->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); + rxrpc_activate_bundle(bundle); spin_unlock(&local->client_bundles_lock); - _leave(" = %u [found]", bundle->debug_id); - return bundle; + _leave(" = B=%u [found]", call->bundle->debug_id); + return call->bundle; } /* @@ -362,31 +373,25 @@ found_bundle: * If we return with a connection, the call will be on its waiting list. It's * left to the caller to assign a channel and wake up the call. */ -static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); - if (!cp->peer) + call->peer = rxrpc_lookup_peer(call->local, &call->dest_srx, gfp); + if (!call->peer) goto error; call->tx_last_sent = ktime_get_real(); - call->cong_ssthresh = cp->peer->cong_ssthresh; + call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; else call->cong_mode = RXRPC_CALL_SLOW_START; - if (cp->upgrade) - __set_bit(RXRPC_CALL_UPGRADE, &call->flags); /* Find the client connection bundle. */ - bundle = rxrpc_look_up_bundle(cp, gfp); + bundle = rxrpc_look_up_bundle(call, gfp); if (!bundle) goto error; @@ -449,7 +454,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (old) trace_rxrpc_client(old, -1, rxrpc_client_replace); candidate->bundle_shift = shift; - atomic_inc(&bundle->active); + rxrpc_activate_bundle(bundle); bundle->conns[i] = candidate; for (j = 0; j < RXRPC_MAXCALLS; j++) set_bit(shift + j, &bundle->avail_chans); @@ -541,7 +546,6 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, rxrpc_see_call(call, rxrpc_call_see_activate_client); list_del_init(&call->chan_wait_link); - call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; @@ -705,14 +709,11 @@ out: * find a connection for a call * - called in process context with IRQs enabled */ -int rxrpc_connect_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle; - struct rxrpc_net *rxnet = cp->local->rxnet; + struct rxrpc_local *local = call->local; + struct rxrpc_net *rxnet = local->rxnet; int ret = 0; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); @@ -721,7 +722,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, rxrpc_get_call(call, rxrpc_call_get_io_thread); - bundle = rxrpc_prep_call(rx, call, cp, srx, gfp); + bundle = rxrpc_prep_call(call, gfp); if (IS_ERR(bundle)) { rxrpc_put_call(call, rxrpc_call_get_io_thread); ret = PTR_ERR(bundle); @@ -738,9 +739,6 @@ granted_channel: /* Paired with the write barrier in rxrpc_activate_one_channel(). */ smp_rmb(); -out_put_bundle: - rxrpc_deactivate_bundle(bundle); - rxrpc_put_bundle(bundle, rxrpc_bundle_get_client_call); out: _leave(" = %d", ret); return ret; @@ -758,7 +756,7 @@ wait_failed: trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed); rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); rxrpc_disconnect_client_call(bundle, call); - goto out_put_bundle; + goto out; } /* @@ -945,11 +943,15 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) /* * Drop the active count on a bundle. */ -static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) { - struct rxrpc_local *local = bundle->local; + struct rxrpc_local *local; bool need_put = false; + if (!bundle) + return; + + local = bundle->local; if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { if (!bundle->exclusive) { _debug("erase bundle"); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 2e3f0a222e1b..2a7d5378300c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -208,7 +208,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) } if (rxrpc_is_client_call(call)) { - rxrpc_disconnect_client_call(conn->bundle, call); + rxrpc_disconnect_client_call(call->bundle, call); } else { spin_lock(&conn->bundle->channel_lock); __rxrpc_disconnect_call(conn, call); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index d67808b659f1..2a003c3a9897 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -564,7 +564,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ - rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p\n", call); return call; } -- cgit v1.2.3 From 0b9bb322f13d486d5b8630264ccbfb4794bb43a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Oct 2022 00:16:55 +0100 Subject: rxrpc: Split out the call state changing functions into their own file Split out the functions that change the state of an rxrpc call into their own file. The idea being to remove anything to do with changing the state of a call directly from the rxrpc sendmsg() and recvmsg() paths and have all that done in the I/O thread only, with the ultimate aim of removing the state lock entirely. Moving the code out of sendmsg.c and recvmsg.c makes that easier to manage. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 26 ++++++++++----- net/rxrpc/call_state.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/recvmsg.c | 81 -------------------------------------------- 4 files changed, 108 insertions(+), 89 deletions(-) create mode 100644 net/rxrpc/call_state.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index e76d3459d78e..ac5caf5a48e1 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -10,6 +10,7 @@ rxrpc-y := \ call_accept.o \ call_event.o \ call_object.o \ + call_state.o \ conn_client.o \ conn_event.o \ conn_object.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2740c6333114..203e0354d86b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -885,6 +885,24 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) return !rxrpc_is_service_call(call); } +/* + * call_state.c + */ +bool __rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error); +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error); +bool __rxrpc_call_completed(struct rxrpc_call *call); +bool rxrpc_call_completed(struct rxrpc_call *call); +bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why); +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why); + /* * conn_client.c */ @@ -1116,14 +1134,6 @@ extern const struct seq_operations rxrpc_local_seq_ops; * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); -bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); -bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); -bool __rxrpc_call_completed(struct rxrpc_call *); -bool rxrpc_call_completed(struct rxrpc_call *); -bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why); -bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* diff --git a/net/rxrpc/call_state.c b/net/rxrpc/call_state.c new file mode 100644 index 000000000000..8fbb2112ed7e --- /dev/null +++ b/net/rxrpc/call_state.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Call state changing functions. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include "ar-internal.h" + +/* + * Transition a call to the complete state. + */ +bool __rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + if (call->state < RXRPC_CALL_COMPLETE) { + call->abort_code = abort_code; + call->error = error; + call->completion = compl; + call->state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); + wake_up(&call->waitq); + rxrpc_notify_socket(call); + return true; + } + return false; +} + +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + bool ret = false; + + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock(&call->state_lock); + ret = __rxrpc_set_call_completion(call, compl, abort_code, error); + write_unlock(&call->state_lock); + } + return ret; +} + +/* + * Record that a call successfully completed. + */ +bool __rxrpc_call_completed(struct rxrpc_call *call) +{ + return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); +} + +bool rxrpc_call_completed(struct rxrpc_call *call) +{ + bool ret = false; + + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock(&call->state_lock); + ret = __rxrpc_call_completed(call); + write_unlock(&call->state_lock); + } + return ret; +} + +/* + * Record that a call is locally aborted. + */ +bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) +{ + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, + abort_code, error); + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); +} + +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) +{ + bool ret; + + write_lock(&call->state_lock); + ret = __rxrpc_abort_call(call, seq, abort_code, error, why); + write_unlock(&call->state_lock); + if (ret && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + rxrpc_send_abort_packet(call); + return ret; +} diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 59b521b82aec..ff08f917ecda 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -58,87 +58,6 @@ void rxrpc_notify_socket(struct rxrpc_call *call) _leave(""); } -/* - * Transition a call to the complete state. - */ -bool __rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; - call->error = error; - call->completion = compl; - call->state = RXRPC_CALL_COMPLETE; - trace_rxrpc_call_complete(call); - wake_up(&call->waitq); - rxrpc_notify_socket(call); - return true; - } - return false; -} - -bool rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - bool ret = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock(&call->state_lock); - } - return ret; -} - -/* - * Record that a call successfully completed. - */ -bool __rxrpc_call_completed(struct rxrpc_call *call) -{ - return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); -} - -bool rxrpc_call_completed(struct rxrpc_call *call) -{ - bool ret = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock(&call->state_lock); - } - return ret; -} - -/* - * Record that a call is locally aborted. - */ -bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why) -{ - trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, - abort_code, error); - return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error); -} - -bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why) -{ - bool ret; - - write_lock(&call->state_lock); - ret = __rxrpc_abort_call(call, seq, abort_code, error, why); - write_unlock(&call->state_lock); - if (ret && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) - rxrpc_send_abort_packet(call); - return ret; -} - /* * Pass a call terminating message to userspace. */ -- cgit v1.2.3 From d41b3f5b96881809c73f86e3ca436c9426610b7a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Dec 2022 15:32:32 +0000 Subject: rxrpc: Wrap accesses to get call state to put the barrier in one place Wrap accesses to get the state of a call from outside of the I/O thread in a single place so that the barrier needed to order wrt the error code and abort code is in just that place. Also use a barrier when setting the call state and again when reading the call state such that the auxiliary completion info (error code, abort code) can be read without taking a read lock on the call state lock. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 16 ++++++++++++++++ net/rxrpc/call_state.c | 3 ++- net/rxrpc/recvmsg.c | 12 ++++++------ net/rxrpc/sendmsg.c | 29 +++++++++++++---------------- 5 files changed, 38 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f4e1ffff2ba4..61c30d0f6735 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -379,7 +379,7 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) { - return call->state != RXRPC_CALL_COMPLETE; + return !rxrpc_call_is_complete(call); } EXPORT_SYMBOL(rxrpc_kernel_check_life); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 203e0354d86b..9e992487649c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -903,6 +903,22 @@ bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error, enum rxrpc_abort_reason why); +static inline enum rxrpc_call_state rxrpc_call_state(const struct rxrpc_call *call) +{ + /* Order read ->state before read ->error. */ + return smp_load_acquire(&call->state); +} + +static inline bool rxrpc_call_is_complete(const struct rxrpc_call *call) +{ + return rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; +} + +static inline bool rxrpc_call_has_failed(const struct rxrpc_call *call) +{ + return rxrpc_call_is_complete(call) && call->completion != RXRPC_CALL_SUCCEEDED; +} + /* * conn_client.c */ diff --git a/net/rxrpc/call_state.c b/net/rxrpc/call_state.c index 8fbb2112ed7e..649fb9e5d1af 100644 --- a/net/rxrpc/call_state.c +++ b/net/rxrpc/call_state.c @@ -19,7 +19,8 @@ bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->abort_code = abort_code; call->error = error; call->completion = compl; - call->state = RXRPC_CALL_COMPLETE; + /* Allow reader of completion state to operate locklessly */ + smp_store_release(&call->state, RXRPC_CALL_COMPLETE); trace_rxrpc_call_complete(call); wake_up(&call->waitq); rxrpc_notify_socket(call); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index ff08f917ecda..7bf36a8839ec 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -89,7 +89,7 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); break; default: - pr_err("Invalid terminal call state %u\n", call->state); + pr_err("Invalid terminal call state %u\n", call->completion); BUG(); break; } @@ -111,7 +111,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); - if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) + if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_RECV_REPLY) rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); write_lock(&call->state_lock); @@ -210,7 +210,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; - if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { + if (rxrpc_call_state(call) >= RXRPC_CALL_SERVER_ACK_REQUEST) { seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; ret = 1; goto done; @@ -416,7 +416,7 @@ try_again: msg->msg_namelen = len; } - switch (READ_ONCE(call->state)) { + switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: @@ -436,7 +436,7 @@ try_again: if (ret < 0) goto error_unlock_call; - if (call->state == RXRPC_CALL_COMPLETE) { + if (rxrpc_call_is_complete(call)) { ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) goto error_unlock_call; @@ -516,7 +516,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, mutex_lock(&call->user_mutex); - switch (READ_ONCE(call->state)) { + switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2a003c3a9897..f0b5822f3e04 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -25,7 +25,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, { _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); - if (!call->send_abort && call->state < RXRPC_CALL_COMPLETE) { + if (!call->send_abort && !rxrpc_call_is_complete(call)) { call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; @@ -60,7 +60,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, NULL)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) @@ -95,7 +95,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, &tx_win)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && @@ -124,7 +124,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, NULL)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); @@ -273,7 +273,7 @@ reload: ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; - state = READ_ONCE(call->state); + state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; @@ -350,7 +350,7 @@ reload: /* check for the far side aborting the call or a network error * occurring */ - if (call->state == RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ @@ -375,12 +375,9 @@ reload: success: ret = copied; - if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) { - read_lock(&call->state_lock); - if (call->error < 0) - ret = call->error; - read_unlock(&call->state_lock); - } + if (rxrpc_call_is_complete(call) && + call->error < 0) + ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); @@ -618,10 +615,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); /* ... and we have the call lock. */ ret = 0; - if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { - switch (READ_ONCE(call->state)) { + switch (rxrpc_call_state(call)) { case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_PREALLOC: @@ -675,7 +672,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) break; } - state = READ_ONCE(call->state); + state = rxrpc_call_state(call); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, state, call->conn); @@ -735,7 +732,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); - switch (READ_ONCE(call->state)) { + switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: case RXRPC_CALL_SERVER_SEND_REPLY: -- cgit v1.2.3 From 2d689424b6184535890c251f937ccf815fde9cd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 08:35:36 +0000 Subject: rxrpc: Move call state changes from sendmsg to I/O thread Move all the call state changes that are made in rxrpc_sendmsg() to the I/O thread. This is a step towards removing the call state lock. This requires the switch to the RXRPC_CALL_CLIENT_AWAIT_REPLY and RXRPC_CALL_SERVER_SEND_REPLY states to be done when the last packet is decanted from ->tx_sendmsg to ->tx_buffer in the I/O thread, not when it is added to ->tx_sendmsg by sendmsg(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/call_event.c | 50 ++++++++++++++++++++++++++++++++++-- net/rxrpc/sendmsg.c | 69 ++++++++++---------------------------------------- 2 files changed, 61 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 695aeb70d1a6..2e3c01060d59 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -251,6 +251,50 @@ out: _leave(""); } +/* + * Start transmitting the reply to a service. This cancels the need to ACK the + * request if we haven't yet done so. + */ +static void rxrpc_begin_service_reply(struct rxrpc_call *call) +{ + unsigned long now; + + write_lock(&call->state_lock); + + if (call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + now = jiffies; + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); + if (call->ackr_reason == RXRPC_ACK_DELAY) + call->ackr_reason = 0; + trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); + } + + write_unlock(&call->state_lock); +} + +/* + * Close the transmission phase. After this point there is no more data to be + * transmitted in the call. + */ +static void rxrpc_close_tx_phase(struct rxrpc_call *call) +{ + _debug("________awaiting reply/ACK__________"); + + write_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock(&call->state_lock); +} + static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) { unsigned int winsize = min_t(unsigned int, call->tx_winsize, @@ -285,6 +329,9 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) call->tx_top = txb->seq; list_add_tail(&txb->call_link, &call->tx_buffer); + if (txb->wire.flags & RXRPC_LAST_PACKET) + rxrpc_close_tx_phase(call); + rxrpc_transmit_one(call, txb); if (!rxrpc_tx_window_has_space(call)) @@ -298,12 +345,11 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) case RXRPC_CALL_SERVER_ACK_REQUEST: if (list_empty(&call->tx_sendmsg)) return; + rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: - case RXRPC_CALL_SERVER_AWAIT_ACK: case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_CLIENT_AWAIT_REPLY: if (!rxrpc_tx_window_has_space(call)) return; if (list_empty(&call->tx_sendmsg)) { diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index f0b5822f3e04..0428528abbf4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -189,7 +189,6 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { - unsigned long now; rxrpc_seq_t seq = txb->seq; bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; @@ -212,36 +211,10 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, poke = list_empty(&call->tx_sendmsg); list_add_tail(&txb->call_link, &call->tx_sendmsg); call->tx_prepared = seq; + if (last) + rxrpc_notify_end_tx(rx, call, notify_end_tx); spin_unlock(&call->tx_lock); - if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { - _debug("________awaiting reply/ACK__________"); - write_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; - rxrpc_notify_end_tx(rx, call, notify_end_tx); - break; - case RXRPC_CALL_SERVER_ACK_REQUEST: - call->state = RXRPC_CALL_SERVER_SEND_REPLY; - now = jiffies; - WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); - if (call->ackr_reason == RXRPC_ACK_DELAY) - call->ackr_reason = 0; - trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); - if (!last) - break; - fallthrough; - case RXRPC_CALL_SERVER_SEND_REPLY: - call->state = RXRPC_CALL_SERVER_AWAIT_ACK; - rxrpc_notify_end_tx(rx, call, notify_end_tx); - break; - default: - break; - } - write_unlock(&call->state_lock); - } - if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } @@ -280,8 +253,13 @@ reload: ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && - state != RXRPC_CALL_SERVER_SEND_REPLY) + state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Request phase complete for this client call */ + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); goto maybe_error; + } ret = -EMSGSIZE; if (call->tx_total_len != -1) { @@ -573,7 +551,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { - enum rxrpc_call_state state; struct rxrpc_call *call; unsigned long now, j; bool dropped_lock = false; @@ -672,11 +649,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) break; } - state = rxrpc_call_state(call); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, state, call->conn); - - if (state >= RXRPC_CALL_COMPLETE) { + if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { @@ -722,7 +695,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, bool dropped_lock = false; int ret; - _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); @@ -732,26 +705,10 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); - switch (rxrpc_call_state(call)) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - case RXRPC_CALL_SERVER_SEND_REPLY: - ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, - notify_end_tx, &dropped_lock); - break; - case RXRPC_CALL_COMPLETE: - read_lock(&call->state_lock); + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, + notify_end_tx, &dropped_lock); + if (ret == -ESHUTDOWN) ret = call->error; - read_unlock(&call->state_lock); - break; - default: - /* Request phase complete for this client call */ - trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, - call->cid, call->call_id, call->rx_consumed, - 0, -EPROTO); - ret = -EPROTO; - break; - } if (!dropped_lock) mutex_unlock(&call->user_mutex); -- cgit v1.2.3 From 93368b6bd58ac49d804fdc9ab041a6dc89ebf1cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Oct 2022 23:43:00 +0100 Subject: rxrpc: Move call state changes from recvmsg to I/O thread Move the call state changes that are made in rxrpc_recvmsg() to the I/O thread. This means that, thenceforth, only the I/O thread does this and the call state lock can be removed. This requires the Rx phase to be ended when the last packet is received, not when it is processed. Since this now changes the rxrpc call state to SUCCEEDED before we've consumed all the data from it, rxrpc_kernel_check_life() mustn't say the call is dead until the recvmsg queue is empty (unless the call has failed). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 10 ++- net/rxrpc/ar-internal.h | 3 +- net/rxrpc/input.c | 38 ++++++++++- net/rxrpc/recvmsg.c | 168 ++++++++++++++++++------------------------------ 4 files changed, 108 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 61c30d0f6735..cf200e4e0eae 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -373,13 +373,17 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * @sock: The socket the call is on * @call: The call to check * - * Allow a kernel service to find out whether a call is still alive - - * ie. whether it has completed. + * Allow a kernel service to find out whether a call is still alive - whether + * it has completed successfully and all received data has been consumed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) { - return !rxrpc_call_is_complete(call); + if (!rxrpc_call_is_complete(call)) + return true; + if (call->completion != RXRPC_CALL_SUCCEEDED) + return false; + return !skb_queue_empty(&call->recvmsg_queue); } EXPORT_SYMBOL(rxrpc_kernel_check_life); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9e992487649c..861273439736 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -545,7 +545,8 @@ enum rxrpc_call_flag { RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ - RXRPC_CALL_RX_IS_IDLE, /* Reception is idle - send an ACK */ + RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ + RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ }; /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index bd69ff2d9082..6eb21425f41f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -319,6 +319,41 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) return true; } +/* + * End the packet reception phase. + */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) +{ + rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); + + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + + trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); + + if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_RECV_REPLY) + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); + + write_lock(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + __rxrpc_call_completed(call); + write_unlock(&call->state_lock); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; + write_unlock(&call->state_lock); + rxrpc_propose_delay_ACK(call, serial, + rxrpc_propose_ack_processing_op); + break; + default: + write_unlock(&call->state_lock); + break; + } +} + static void rxrpc_input_update_ack_window(struct rxrpc_call *call, rxrpc_seq_t window, rxrpc_seq_t wtop) { @@ -337,8 +372,9 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); - trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); + if (last) + rxrpc_end_rx_phase(call, sp->hdr.serial); } /* diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 7bf36a8839ec..dd54ceee7bcc 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -100,42 +100,6 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) return ret; } -/* - * End the packet reception phase. - */ -static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) -{ - rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); - - _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - - trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); - - if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_RECV_REPLY) - rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); - - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - __rxrpc_call_completed(call); - write_unlock(&call->state_lock); - rxrpc_poke_call(call, rxrpc_call_poke_complete); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; - write_unlock(&call->state_lock); - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_processing_op); - break; - default: - write_unlock(&call->state_lock); - break; - } -} - /* * Discard a packet we've used up and advance the Rx window by one. */ @@ -166,10 +130,9 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate, serial, call->rx_consumed); - if (last) { - rxrpc_end_rx_phase(call, serial); - return; - } + + if (last) + set_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags); /* Check to see if there's an ACK that needs sending. */ acked = atomic_add_return(call->rx_consumed - old_consumed, @@ -194,7 +157,8 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA - * (returns 1). If more packets are required, it returns -EAGAIN. + * (returns 1). If more packets are required, it returns -EAGAIN and if the + * call has failed it returns -EIO. */ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, struct iov_iter *iter, @@ -210,7 +174,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; - if (rxrpc_call_state(call) >= RXRPC_CALL_SERVER_ACK_REQUEST) { + if (rxrpc_call_has_failed(call)) { + seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; + ret = -EIO; + goto done; + } + + if (test_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags)) { seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; ret = 1; goto done; @@ -234,14 +204,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_offset == 0) { ret2 = rxrpc_verify_data(call, skb); - rx_pkt_offset = sp->offset; - rx_pkt_len = sp->len; trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, - rx_pkt_offset, rx_pkt_len, ret2); + sp->offset, sp->len, ret2); if (ret2 < 0) { + kdebug("verify = %d", ret2); ret = ret2; goto out; } + rx_pkt_offset = sp->offset; + rx_pkt_len = sp->len; } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -416,36 +387,36 @@ try_again: msg->msg_namelen = len; } - switch (rxrpc_call_state(call)) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, - flags, &copied); - if (ret == -EAGAIN) - ret = 0; - - if (!skb_queue_empty(&call->recvmsg_queue)) - rxrpc_notify_socket(call); - break; - default: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) ret = 0; - break; - } - + if (ret == -EIO) + goto call_failed; if (ret < 0) goto error_unlock_call; - if (rxrpc_call_is_complete(call)) { - ret = rxrpc_recvmsg_term(call, msg); - if (ret < 0) - goto error_unlock_call; - if (!(flags & MSG_PEEK)) - rxrpc_release_call(rx, call); - msg->msg_flags |= MSG_EOR; - ret = 1; - } + if (rxrpc_call_is_complete(call) && + skb_queue_empty(&call->recvmsg_queue)) + goto call_complete; + if (rxrpc_call_has_failed(call)) + goto call_failed; + rxrpc_notify_socket(call); + goto not_yet_complete; + +call_failed: + rxrpc_purge_queue(&call->recvmsg_queue); +call_complete: + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error_unlock_call; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; + +not_yet_complete: if (ret == 0) msg->msg_flags |= MSG_MORE; else @@ -508,49 +479,34 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, size_t offset = 0; int ret; - _enter("{%d,%s},%zu,%d", - call->debug_id, rxrpc_call_states[call->state], - *_len, want_more); - - ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_SECURING); + _enter("{%d},%zu,%d", call->debug_id, *_len, want_more); mutex_lock(&call->user_mutex); - switch (rxrpc_call_state(call)) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, NULL, iter, - *_len, 0, &offset); - *_len -= offset; - if (ret < 0) - goto out; - - /* We can only reach here with a partially full buffer if we - * have reached the end of the data. We must otherwise have a - * full buffer or have been given -EAGAIN. - */ - if (ret == 1) { - if (iov_iter_count(iter) > 0) - goto short_data; - if (!want_more) - goto read_phase_complete; - ret = 0; - goto out; - } - - if (!want_more) - goto excess_data; + ret = rxrpc_recvmsg_data(sock, call, NULL, iter, *_len, 0, &offset); + *_len -= offset; + if (ret == -EIO) + goto call_failed; + if (ret < 0) goto out; - case RXRPC_CALL_COMPLETE: - goto call_complete; - - default: - ret = -EINPROGRESS; + /* We can only reach here with a partially full buffer if we have + * reached the end of the data. We must otherwise have a full buffer + * or have been given -EAGAIN. + */ + if (ret == 1) { + if (iov_iter_count(iter) > 0) + goto short_data; + if (!want_more) + goto read_phase_complete; + ret = 0; goto out; } + if (!want_more) + goto excess_data; + goto out; + read_phase_complete: ret = 1; out: @@ -572,7 +528,7 @@ excess_data: 0, -EMSGSIZE); ret = -EMSGSIZE; goto out; -call_complete: +call_failed: *_abort = call->abort_code; ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { -- cgit v1.2.3 From 96b4059f43ce69e9c590f77d6ce3e99888d5cfe6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Oct 2022 11:25:55 +0100 Subject: rxrpc: Remove call->state_lock All the setters of call->state are now in the I/O thread and thus the state lock is now unnecessary. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 33 +++++++++++------ net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_event.c | 42 +++++++++------------- net/rxrpc/call_object.c | 30 ++++++++-------- net/rxrpc/call_state.c | 87 +++++++++++++++++--------------------------- net/rxrpc/conn_client.c | 10 +++--- net/rxrpc/conn_event.c | 11 ++---- net/rxrpc/input.c | 96 +++++++++++++++++++++---------------------------- net/rxrpc/output.c | 4 +-- net/rxrpc/proc.c | 6 ++-- net/rxrpc/rxkad.c | 2 +- net/rxrpc/sendmsg.c | 3 -- 12 files changed, 142 insertions(+), 184 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 861273439736..751b1903fd6e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -631,14 +631,13 @@ struct rxrpc_call { unsigned long flags; unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ - rwlock_t state_lock; /* lock for state transition */ unsigned int send_abort_why; /* Why the abort [enum rxrpc_abort_reason] */ s32 send_abort; /* Abort code to be sent */ short send_abort_err; /* Error to be associated with the abort */ rxrpc_seq_t send_abort_seq; /* DATA packet that incurred the abort (or 0) */ s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ - enum rxrpc_call_state state; /* current state of call */ + enum rxrpc_call_state _state; /* Current state of call (needs barrier) */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; u8 security_ix; /* Security type */ @@ -889,25 +888,37 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) /* * call_state.c */ -bool __rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error); bool rxrpc_set_call_completion(struct rxrpc_call *call, enum rxrpc_call_completion compl, u32 abort_code, int error); -bool __rxrpc_call_completed(struct rxrpc_call *call); bool rxrpc_call_completed(struct rxrpc_call *call); -bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why); bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error, enum rxrpc_abort_reason why); +void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, + int error); + +static inline void rxrpc_set_call_state(struct rxrpc_call *call, + enum rxrpc_call_state state) +{ + /* Order write of completion info before write of ->state. */ + smp_store_release(&call->_state, state); +} + +static inline enum rxrpc_call_state __rxrpc_call_state(const struct rxrpc_call *call) +{ + return call->_state; /* Only inside I/O thread */ +} + +static inline bool __rxrpc_call_is_complete(const struct rxrpc_call *call) +{ + return __rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; +} static inline enum rxrpc_call_state rxrpc_call_state(const struct rxrpc_call *call) { - /* Order read ->state before read ->error. */ - return smp_load_acquire(&call->state); + /* Order read ->state before read of completion info. */ + return smp_load_acquire(&call->_state); } static inline bool rxrpc_call_is_complete(const struct rxrpc_call *call) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index a132d486dea0..3fbf2fcaaf9e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -99,7 +99,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); - call->state = RXRPC_CALL_SERVER_PREALLOC; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_PREALLOC); __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2e3c01060d59..1abdef15debc 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -257,20 +257,13 @@ out: */ static void rxrpc_begin_service_reply(struct rxrpc_call *call) { - unsigned long now; - - write_lock(&call->state_lock); - - if (call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { - now = jiffies; - call->state = RXRPC_CALL_SERVER_SEND_REPLY; - WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); - if (call->ackr_reason == RXRPC_ACK_DELAY) - call->ackr_reason = 0; - trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); - } + unsigned long now = jiffies; - write_unlock(&call->state_lock); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SEND_REPLY); + WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); + if (call->ackr_reason == RXRPC_ACK_DELAY) + call->ackr_reason = 0; + trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); } /* @@ -281,18 +274,16 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) { _debug("________awaiting reply/ACK__________"); - write_lock(&call->state_lock); - switch (call->state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_REPLY); break; case RXRPC_CALL_SERVER_SEND_REPLY: - call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_AWAIT_ACK); break; default: break; } - write_unlock(&call->state_lock); } static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) @@ -341,7 +332,7 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) static void rxrpc_transmit_some_data(struct rxrpc_call *call) { - switch (call->state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: if (list_empty(&call->tx_sendmsg)) return; @@ -390,9 +381,10 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) //printk("\n--------------------\n"); _enter("{%d,%s,%lx}", - call->debug_id, rxrpc_call_states[call->state], call->events); + call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], + call->events); - if (call->state == RXRPC_CALL_COMPLETE) + if (__rxrpc_call_is_complete(call)) goto out; /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ @@ -415,7 +407,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } t = READ_ONCE(call->expect_req_by); - if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST && + if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST && time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); expired = true; @@ -499,7 +491,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && call->state != RXRPC_CALL_CLIENT_RECV_REPLY) + if (resend && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY) rxrpc_resend(call, NULL); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) @@ -511,7 +503,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_input_data); /* Make sure the timer is restarted */ - if (call->state != RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { next = call->expect_rx_by; #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } @@ -532,7 +524,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } out: - if (call->state == RXRPC_CALL_COMPLETE) { + if (__rxrpc_call_is_complete(call)) { del_timer_sync(&call->timer); if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) rxrpc_disconnect_call(call); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 835e9781afc6..c94161acf3c4 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -69,7 +69,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { trace_rxrpc_timer_expired(call, jiffies); rxrpc_poke_call(call, rxrpc_call_poke_timer); } @@ -162,7 +162,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); - rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; @@ -211,7 +210,6 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; - call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->dest_srx = *srx; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; @@ -227,11 +225,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, ret = rxrpc_init_client_call_security(call); if (ret < 0) { - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); rxrpc_put_call(call, rxrpc_call_put_discard_error); return ERR_PTR(ret); } + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_CONN); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), p->user_call_ID, rxrpc_call_new_client); @@ -384,8 +384,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, error_dup_user_ID: write_unlock(&rx->call_lock); release_sock(&rx->sk); - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, -EEXIST); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); rxrpc_release_call(rx, call); @@ -403,8 +402,7 @@ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, ret); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } @@ -427,25 +425,25 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->call_id = sp->hdr.callNumber; call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; - call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - call->state = RXRPC_CALL_SERVER_SECURING; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); break; case RXRPC_CONN_SERVICE: - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: - __rxrpc_set_call_completion(call, conn->completion, - conn->abort_code, conn->error); + rxrpc_set_call_completion(call, conn->completion, + conn->abort_code, conn->error); break; default: BUG(); @@ -614,7 +612,7 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) dead = __refcount_dec_and_test(&call->ref, &r); trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { spin_lock(&rxnet->call_lock); @@ -677,7 +675,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) { memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); del_timer(&call->timer); @@ -715,7 +713,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), - rxrpc_call_states[call->state], + rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); spin_unlock(&rxnet->call_lock); diff --git a/net/rxrpc/call_state.c b/net/rxrpc/call_state.c index 649fb9e5d1af..27dc1242b712 100644 --- a/net/rxrpc/call_state.c +++ b/net/rxrpc/call_state.c @@ -10,81 +10,60 @@ /* * Transition a call to the complete state. */ -bool __rxrpc_set_call_completion(struct rxrpc_call *call, +bool rxrpc_set_call_completion(struct rxrpc_call *call, enum rxrpc_call_completion compl, u32 abort_code, int error) { - if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; - call->error = error; - call->completion = compl; - /* Allow reader of completion state to operate locklessly */ - smp_store_release(&call->state, RXRPC_CALL_COMPLETE); - trace_rxrpc_call_complete(call); - wake_up(&call->waitq); - rxrpc_notify_socket(call); - return true; - } - return false; -} - -bool rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - bool ret = false; + if (__rxrpc_call_state(call) == RXRPC_CALL_COMPLETE) + return false; - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock(&call->state_lock); - } - return ret; + call->abort_code = abort_code; + call->error = error; + call->completion = compl; + /* Allow reader of completion state to operate locklessly */ + rxrpc_set_call_state(call, RXRPC_CALL_COMPLETE); + trace_rxrpc_call_complete(call); + wake_up(&call->waitq); + rxrpc_notify_socket(call); + return true; } /* * Record that a call successfully completed. */ -bool __rxrpc_call_completed(struct rxrpc_call *call) -{ - return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); -} - bool rxrpc_call_completed(struct rxrpc_call *call) { - bool ret = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock(&call->state_lock); - } - return ret; + return rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); } /* * Record that a call is locally aborted. */ -bool __rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why) +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) { trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, abort_code, error); - return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error); + if (!rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error)) + return false; + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + rxrpc_send_abort_packet(call); + return true; } -bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, - u32 abort_code, int error, enum rxrpc_abort_reason why) +/* + * Record that a call errored out before even getting off the ground, thereby + * setting the state to allow it to be destroyed. + */ +void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, + int error) { - bool ret; - - write_lock(&call->state_lock); - ret = __rxrpc_abort_call(call, seq, abort_code, error, why); - write_unlock(&call->state_lock); - if (ret && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) - rxrpc_send_abort_packet(call); - return ret; + call->abort_code = RX_CALL_DEAD; + call->error = error; + call->completion = compl; + call->_state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); + __set_bit(RXRPC_CALL_RELEASED, &call->flags); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index c0db7722571e..8b5ea68dc47e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -553,9 +553,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_connect_call(call); - write_lock(&call->state_lock); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - write_unlock(&call->state_lock); + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); /* Paired with the read barrier in rxrpc_connect_call(). This orders * cid and epoch in the connection wrt to call_id without the need to @@ -687,7 +685,7 @@ static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle, set_current_state(TASK_UNINTERRUPTIBLE); break; } - if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_AWAIT_CONN) + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) break; if ((call->interruptibility == RXRPC_INTERRUPTIBLE || call->interruptibility == RXRPC_PREINTERRUPTIBLE) && @@ -729,7 +727,7 @@ int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) goto out; } - if (call->state == RXRPC_CALL_CLIENT_AWAIT_CONN) { + if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_CONN) { ret = rxrpc_wait_for_channel(bundle, call, gfp); if (ret < 0) goto wait_failed; @@ -748,7 +746,7 @@ wait_failed: list_del_init(&call->chan_wait_link); spin_unlock(&bundle->channel_lock); - if (call->state != RXRPC_CALL_CLIENT_AWAIT_CONN) { + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) { ret = 0; goto granted_channel; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b2042702ca9a..8d0b9ff0a5e1 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -230,14 +230,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - _enter("%p", call); - if (call) { - write_lock(&call->state_lock); - if (call->state == RXRPC_CALL_SERVER_SECURING) { - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - rxrpc_notify_socket(call); - } - write_unlock(&call->state_lock); + if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + rxrpc_notify_socket(call); } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6eb21425f41f..367927a99881 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -184,7 +184,7 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) if (call->cong_mode != RXRPC_CALL_SLOW_START && call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) return; - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); @@ -252,43 +252,31 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, enum rxrpc_abort_reason abort_why) { - unsigned int state; - ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - write_lock(&call->state_lock); - - state = call->state; - switch (state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: - if (reply_begun) - call->state = state = RXRPC_CALL_CLIENT_RECV_REPLY; - else - call->state = state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + if (reply_begun) { + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_RECV_REPLY); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); + break; + } + + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_REPLY); + trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); break; case RXRPC_CALL_SERVER_AWAIT_ACK: - __rxrpc_call_completed(call); - state = call->state; + rxrpc_call_completed(call); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); break; default: - goto bad_state; + kdebug("end_tx %s", rxrpc_call_states[__rxrpc_call_state(call)]); + rxrpc_proto_abort(call, call->tx_top, abort_why); + break; } - - write_unlock(&call->state_lock); - if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); - else - trace_rxrpc_txqueue(call, rxrpc_txqueue_end); - _leave(" = ok"); - return; - -bad_state: - write_unlock(&call->state_lock); - kdebug("end_tx %s", rxrpc_call_states[call->state]); - rxrpc_proto_abort(call, call->tx_top, abort_why); } /* @@ -303,7 +291,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) if (call->ackr_reason) { now = jiffies; timo = now + MAX_JIFFY_OFFSET; - WRITE_ONCE(call->resend_at, timo); + WRITE_ONCE(call->delay_ack_at, timo); trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } @@ -326,30 +314,23 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) { rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); - _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + _enter("%d,%s", call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)]); trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); - if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_RECV_REPLY) - rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); - - write_lock(&call->state_lock); - - switch (call->state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_RECV_REPLY: - __rxrpc_call_completed(call); - write_unlock(&call->state_lock); + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); + rxrpc_call_completed(call); break; case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_ACK_REQUEST); call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; - write_unlock(&call->state_lock); - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_processing_op); + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op); break; + default: - write_unlock(&call->state_lock); break; } } @@ -583,7 +564,6 @@ protocol_error: static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - enum rxrpc_call_state state; rxrpc_serial_t serial = sp->hdr.serial; rxrpc_seq_t seq0 = sp->hdr.seq; @@ -591,11 +571,20 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) atomic64_read(&call->ackr_window), call->rx_highest_seq, skb->len, seq0); - state = READ_ONCE(call->state); - if (state >= RXRPC_CALL_COMPLETE) + if (__rxrpc_call_is_complete(call)) return; - if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { + switch (__rxrpc_call_state(call)) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + /* Received data implicitly ACKs all of the request + * packets we sent when we're acting as a client. + */ + if (!rxrpc_receiving_reply(call)) + goto out_notify; + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; @@ -606,15 +595,12 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_reduce_call_timer(call, expect_req_by, now, rxrpc_timer_set_for_idle); } + break; } - /* Received data implicitly ACKs all of the request packets we sent - * when we're acting as a client. - */ - if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || - state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && - !rxrpc_receiving_reply(call)) - goto out_notify; + default: + break; + } if (!rxrpc_input_split_jumbo(call, skb)) { rxrpc_proto_abort(call, sp->hdr.seq, rxrpc_badmsg_bad_jumbo); @@ -904,7 +890,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ - switch (READ_ONCE(call->state)) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: case RXRPC_CALL_SERVER_SEND_REPLY: @@ -1027,7 +1013,7 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) */ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) { - switch (READ_ONCE(call->state)) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); fallthrough; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8a5ff2c9e061..a9746be29634 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -261,7 +261,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) rxrpc_tx_point_call_ack); rxrpc_tx_backoff(call, ret); - if (call->state < RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { if (ret < 0) rxrpc_cancel_rtt_probe(call, serial, rtt_slot); rxrpc_set_keepalive(call); @@ -723,7 +723,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) static inline void rxrpc_instant_resend(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { - if (call->state < RXRPC_CALL_COMPLETE) + if (!__rxrpc_call_is_complete(call)) kdebug("resend"); } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 63947cce4048..c39ef94602ed 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -50,6 +50,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_local *local; struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + enum rxrpc_call_state state; unsigned long timeout = 0; rxrpc_seq_t acks_hard_ack; char lbuff[50], rbuff[50]; @@ -74,7 +75,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pISpc", &call->dest_srx.transport); - if (call->state != RXRPC_CALL_SERVER_PREALLOC) { + state = rxrpc_call_state(call); + if (state != RXRPC_CALL_SERVER_PREALLOC) { timeout = READ_ONCE(call->expect_rx_by); timeout -= jiffies; } @@ -91,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->call_id, rxrpc_is_service_call(call) ? "Svc" : "Clt", refcount_read(&call->ref), - rxrpc_call_states[call->state], + rxrpc_call_states[state], call->abort_code, call->debug_id, acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index e52cb8058156..dfb01e7b90fb 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1143,7 +1143,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, call = rcu_dereference_protected( conn->channels[i].call, lockdep_is_held(&conn->bundle->channel_lock)); - if (call && call->state < RXRPC_CALL_COMPLETE) { + if (call && !__rxrpc_call_is_complete(call)) { rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, rxkad_abort_resp_call_state); goto protocol_error_unlock; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0428528abbf4..a5d0005b7ce5 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -702,9 +702,6 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, mutex_lock(&call->user_mutex); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, notify_end_tx, &dropped_lock); if (ret == -ESHUTDOWN) -- cgit v1.2.3 From 0d6bf319bc5aba4535bb46e1b607973688a2248a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Nov 2022 16:46:13 +0000 Subject: rxrpc: Move the client conn cache management to the I/O thread Move the management of the client connection cache to the I/O thread rather than managing it from the namespace as an aggregate across all the local endpoints within the namespace. This will allow a load of locking to be got rid of in a future patch as only the I/O thread will be looking at the this. The downside is that the total number of cached connections on the system can get higher because the limit is now per-local rather than per-netns. We can, however, keep the number of client conns in use across the entire netfs and use that to reduce the expiration time of idle connection. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 17 ++++----- net/rxrpc/conn_client.c | 92 +++++++++++++++++------------------------------- net/rxrpc/conn_object.c | 1 - net/rxrpc/io_thread.c | 4 +++ net/rxrpc/local_object.c | 17 +++++++++ net/rxrpc/net_ns.c | 17 --------- 6 files changed, 62 insertions(+), 86 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 751b1903fd6e..de84061a5447 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -76,13 +76,7 @@ struct rxrpc_net { bool live; - bool kill_all_client_conns; atomic_t nr_client_conns; - spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ - struct mutex client_conn_discard_lock; /* Prevent multiple discarders */ - struct list_head idle_client_conns; - struct work_struct client_conn_reaper; - struct timer_list client_conn_reap_timer; struct hlist_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ @@ -294,8 +288,16 @@ struct rxrpc_local { struct sk_buff_head rx_queue; /* Received packets */ struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ + struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ + bool kill_all_client_conns; + spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ + struct list_head idle_client_conns; + struct timer_list client_conn_reap_timer; + unsigned long client_conn_flags; +#define RXRPC_CLIENT_CONN_REAP_TIMER 0 /* The client conn reap timer expired */ + spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ @@ -946,8 +948,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); -void rxrpc_discard_expired_client_conns(struct work_struct *); -void rxrpc_destroy_all_client_connections(struct rxrpc_net *); +void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 8b5ea68dc47e..ebb43f65ebc5 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -578,17 +578,17 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, */ static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet = bundle->local->rxnet; + struct rxrpc_local *local = bundle->local; bool drop_ref; if (!list_empty(&conn->cache_link)) { drop_ref = false; - spin_lock(&rxnet->client_conn_cache_lock); + spin_lock(&local->client_conn_cache_lock); if (!list_empty(&conn->cache_link)) { list_del_init(&conn->cache_link); drop_ref = true; } - spin_unlock(&rxnet->client_conn_cache_lock); + spin_unlock(&local->client_conn_cache_lock); if (drop_ref) rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } @@ -710,14 +710,10 @@ out: int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle; - struct rxrpc_local *local = call->local; - struct rxrpc_net *rxnet = local->rxnet; int ret = 0; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); - rxrpc_get_call(call, rxrpc_call_get_io_thread); bundle = rxrpc_prep_call(call, gfp); @@ -787,14 +783,14 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) /* * Set the reap timer. */ -static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet) +static void rxrpc_set_client_reap_timer(struct rxrpc_local *local) { - if (!rxnet->kill_all_client_conns) { + if (!local->kill_all_client_conns) { unsigned long now = jiffies; unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; - if (rxnet->live) - timer_reduce(&rxnet->client_conn_reap_timer, reap_at); + if (local->rxnet->live) + timer_reduce(&local->client_conn_reap_timer, reap_at); } } @@ -805,7 +801,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call { struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; - struct rxrpc_net *rxnet = bundle->local->rxnet; + struct rxrpc_local *local = bundle->local; unsigned int channel; bool may_reuse; u32 cid; @@ -895,11 +891,11 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call conn->idle_timestamp = jiffies; rxrpc_get_connection(conn, rxrpc_conn_get_idle); - spin_lock(&rxnet->client_conn_cache_lock); - list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); - spin_unlock(&rxnet->client_conn_cache_lock); + spin_lock(&local->client_conn_cache_lock); + list_move_tail(&conn->cache_link, &local->idle_client_conns); + spin_unlock(&local->client_conn_cache_lock); - rxrpc_set_client_reap_timer(rxnet); + rxrpc_set_client_reap_timer(local); } out: @@ -986,42 +982,34 @@ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) * This may be called from conn setup or from a work item so cannot be * considered non-reentrant. */ -void rxrpc_discard_expired_client_conns(struct work_struct *work) +void rxrpc_discard_expired_client_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = - container_of(work, struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; _enter(""); - if (list_empty(&rxnet->idle_client_conns)) { + if (list_empty(&local->idle_client_conns)) { _leave(" [empty]"); return; } - /* Don't double up on the discarding */ - if (!mutex_trylock(&rxnet->client_conn_discard_lock)) { - _leave(" [already]"); - return; - } - /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ - nr_conns = atomic_read(&rxnet->nr_client_conns); + nr_conns = atomic_read(&local->rxnet->nr_client_conns); next: - spin_lock(&rxnet->client_conn_cache_lock); + spin_lock(&local->client_conn_cache_lock); - if (list_empty(&rxnet->idle_client_conns)) + if (list_empty(&local->idle_client_conns)) goto out; - conn = list_entry(rxnet->idle_client_conns.next, + conn = list_entry(local->idle_client_conns.next, struct rxrpc_connection, cache_link); - if (!rxnet->kill_all_client_conns) { + if (!local->kill_all_client_conns) { /* If the number of connections is over the reap limit, we * expedite discard by reducing the expiry timeout. We must, * however, have at least a short grace period to be able to do @@ -1044,7 +1032,7 @@ next: trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); - spin_unlock(&rxnet->client_conn_cache_lock); + spin_unlock(&local->client_conn_cache_lock); rxrpc_unbundle_conn(conn); /* Drop the ->cache_link ref */ @@ -1062,32 +1050,11 @@ not_yet_expired: * then things get messier. */ _debug("not yet"); - if (!rxnet->kill_all_client_conns) - timer_reduce(&rxnet->client_conn_reap_timer, conn_expires_at); + if (!local->kill_all_client_conns) + timer_reduce(&local->client_conn_reap_timer, conn_expires_at); out: - spin_unlock(&rxnet->client_conn_cache_lock); - mutex_unlock(&rxnet->client_conn_discard_lock); - _leave(""); -} - -/* - * Preemptively destroy all the client connection records rather than waiting - * for them to time out - */ -void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) -{ - _enter(""); - - spin_lock(&rxnet->client_conn_cache_lock); - rxnet->kill_all_client_conns = true; - spin_unlock(&rxnet->client_conn_cache_lock); - - del_timer_sync(&rxnet->client_conn_reap_timer); - - if (!rxrpc_queue_work(&rxnet->client_conn_reaper)) - _debug("destroy: queue failed"); - + spin_unlock(&local->client_conn_cache_lock); _leave(""); } @@ -1097,14 +1064,19 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn, *tmp; - struct rxrpc_net *rxnet = local->rxnet; LIST_HEAD(graveyard); _enter(""); - spin_lock(&rxnet->client_conn_cache_lock); + spin_lock(&local->client_conn_cache_lock); + local->kill_all_client_conns = true; + spin_unlock(&local->client_conn_cache_lock); + + del_timer_sync(&local->client_conn_reap_timer); + + spin_lock(&local->client_conn_cache_lock); - list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, + list_for_each_entry_safe(conn, tmp, &local->idle_client_conns, cache_link) { if (conn->local == local) { atomic_dec(&conn->active); @@ -1113,7 +1085,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) } } - spin_unlock(&rxnet->client_conn_cache_lock); + spin_unlock(&local->client_conn_cache_lock); while (!list_empty(&graveyard)) { conn = list_entry(graveyard.next, diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 2a7d5378300c..3d8c1dc6a82a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -470,7 +470,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) _enter(""); atomic_dec(&rxnet->nr_conns); - rxrpc_destroy_all_client_connections(rxnet); del_timer_sync(&rxnet->service_conn_reap_timer); rxrpc_queue_work(&rxnet->service_conn_reaper); diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 751139b3c1ac..a299cc34c140 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -435,6 +435,10 @@ int rxrpc_io_thread(void *data) continue; } + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + /* Deal with calls that want immediate attention. */ if ((call = list_first_entry_or_null(&local->call_attend_q, struct rxrpc_call, diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index ca8b3ee68b59..9bc8d08ca12c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -82,6 +82,16 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, } } +static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_local *local = + container_of(timer, struct rxrpc_local, client_conn_reap_timer); + + if (local->kill_all_client_conns && + test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) + rxrpc_wake_up_io_thread(local); +} + /* * Allocate a new local endpoint. */ @@ -103,8 +113,15 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); + local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); + local->kill_all_client_conns = false; + spin_lock_init(&local->client_conn_cache_lock); + INIT_LIST_HEAD(&local->idle_client_conns); + timer_setup(&local->client_conn_reap_timer, + rxrpc_client_conn_reap_timeout, 0); + spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 5905530e2f33..a0319c040c25 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -10,15 +10,6 @@ unsigned int rxrpc_net_id; -static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) -{ - struct rxrpc_net *rxnet = - container_of(timer, struct rxrpc_net, client_conn_reap_timer); - - if (rxnet->live) - rxrpc_queue_work(&rxnet->client_conn_reaper); -} - static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = @@ -63,14 +54,6 @@ static __net_init int rxrpc_init_net(struct net *net) rxrpc_service_conn_reap_timeout, 0); atomic_set(&rxnet->nr_client_conns, 0); - rxnet->kill_all_client_conns = false; - spin_lock_init(&rxnet->client_conn_cache_lock); - mutex_init(&rxnet->client_conn_discard_lock); - INIT_LIST_HEAD(&rxnet->idle_client_conns); - INIT_WORK(&rxnet->client_conn_reaper, - rxrpc_discard_expired_client_conns); - timer_setup(&rxnet->client_conn_reap_timer, - rxrpc_client_conn_reap_timeout, 0); INIT_HLIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); -- cgit v1.2.3 From 9d35d880e0e4a3ab32d8c12f9e4d76198aadd42d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Oct 2022 09:45:43 +0100 Subject: rxrpc: Move client call connection to the I/O thread Move the connection setup of client calls to the I/O thread so that a whole load of locking and barrierage can be eliminated. This necessitates the app thread waiting for connection to complete before it can begin encrypting data. This also completes the fix for a race that exists between call connection and call disconnection whereby the data transmission code adds the call to the peer error distribution list after the call has been disconnected (say by the rxrpc socket getting closed). The fix is to complete the process of moving call connection, data transmission and call disconnection into the I/O thread and thus forcibly serialising them. Note that the issue may predate the overhaul to an I/O thread model that were included in the merge window for v6.2, but the timing is very much changed by the change given below. Fixes: cf37b5987508 ("rxrpc: Move DATA transmission into call processor work item") Reported-by: syzbot+c22650d2844392afdcfd@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 22 +- net/rxrpc/call_object.c | 58 ++++-- net/rxrpc/call_state.c | 2 +- net/rxrpc/conn_client.c | 533 +++++++++++------------------------------------ net/rxrpc/conn_event.c | 49 +---- net/rxrpc/conn_object.c | 19 +- net/rxrpc/conn_service.c | 1 - net/rxrpc/io_thread.c | 13 +- net/rxrpc/local_object.c | 6 +- net/rxrpc/proc.c | 1 + net/rxrpc/rxkad.c | 21 +- net/rxrpc/security.c | 33 ++- net/rxrpc/sendmsg.c | 64 ++++++ 13 files changed, 295 insertions(+), 527 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index de84061a5447..007258538bee 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -292,7 +292,6 @@ struct rxrpc_local { struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ bool kill_all_client_conns; - spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ struct list_head idle_client_conns; struct timer_list client_conn_reap_timer; unsigned long client_conn_flags; @@ -304,7 +303,8 @@ struct rxrpc_local { bool dead; bool service_closed; /* Service socket closed */ struct idr conn_ids; /* List of connection IDs */ - spinlock_t conn_lock; /* Lock for client connection pool */ + struct list_head new_client_calls; /* Newly created client calls need connection */ + spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -385,7 +385,6 @@ enum rxrpc_call_completion { * Bits in the connection flags. */ enum rxrpc_conn_flag { - RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ @@ -413,6 +412,7 @@ enum rxrpc_conn_event { */ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ + RXRPC_CONN_CLIENT_UNSECURED, /* Client connection needs security init */ RXRPC_CONN_CLIENT, /* Client connection */ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ @@ -436,11 +436,9 @@ struct rxrpc_bundle { u32 security_level; /* Security level selected */ u16 service_id; /* Service ID for this connection */ bool try_upgrade; /* True if the bundle is attempting upgrade */ - bool alloc_conn; /* True if someone's getting a conn */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ - short alloc_error; /* Error from last conn allocation */ - spinlock_t channel_lock; + unsigned short alloc_error; /* Error from last conn allocation */ struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ @@ -468,7 +466,7 @@ struct rxrpc_connection { unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ - struct rxrpc_call __rcu *call; /* Active call */ + struct rxrpc_call *call; /* Active call */ unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ @@ -489,6 +487,7 @@ struct rxrpc_connection { struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ union { struct { @@ -619,7 +618,7 @@ struct rxrpc_call { struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ - struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */ + struct list_head wait_link; /* Link in local->new_client_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ @@ -866,6 +865,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct sockaddr_rxrpc *, struct rxrpc_call_params *, gfp_t, unsigned int); +void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); @@ -905,6 +905,7 @@ static inline void rxrpc_set_call_state(struct rxrpc_call *call, { /* Order write of completion info before write of ->state. */ smp_store_release(&call->_state, state); + wake_up(&call->waitq); } static inline enum rxrpc_call_state __rxrpc_call_state(const struct rxrpc_call *call) @@ -940,10 +941,11 @@ extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; -void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local); +void rxrpc_purge_client_connections(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); -int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp); +int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp); +void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c94161acf3c4..3ded5a24627c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -150,7 +150,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); - INIT_LIST_HEAD(&call->chan_wait_link); + INIT_LIST_HEAD(&call->wait_link); INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); @@ -242,7 +242,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, /* * Initiate the call ack/resend/expiry timer. */ -static void rxrpc_start_call_timer(struct rxrpc_call *call) +void rxrpc_start_call_timer(struct rxrpc_call *call) { unsigned long now = jiffies; unsigned long j = now + MAX_JIFFY_OFFSET; @@ -286,6 +286,39 @@ static void rxrpc_put_call_slot(struct rxrpc_call *call) up(limiter); } +/* + * Start the process of connecting a call. We obtain a peer and a connection + * bundle, but the actual association of a call with a connection is offloaded + * to the I/O thread to simplify locking. + */ +static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) +{ + struct rxrpc_local *local = call->local; + int ret = 0; + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp); + if (!call->peer) + goto error; + + ret = rxrpc_look_up_bundle(call, gfp); + if (ret < 0) + goto error; + + trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); + rxrpc_get_call(call, rxrpc_call_get_io_thread); + spin_lock(&local->client_call_lock); + list_add_tail(&call->wait_link, &local->new_client_calls); + spin_unlock(&local->client_call_lock); + rxrpc_wake_up_io_thread(local); + return 0; + +error: + __set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); + return ret; +} + /* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. @@ -369,10 +402,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error_attached_to_socket; - rxrpc_see_call(call, rxrpc_call_see_connected); - - rxrpc_start_call_timer(call); - _leave(" = %p [new]", call); return call; @@ -387,22 +416,20 @@ error_dup_user_ID: rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); - rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); /* We got an error, but the call is attached to the socket and is in - * need of release. However, we might now race with recvmsg() when - * completing the call queues it. Return 0 from sys_sendmsg() and + * need of release. However, we might now race with recvmsg() when it + * completion notifies the socket. Return 0 from sys_sendmsg() and * leave the error to recvmsg() to deal with. */ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } @@ -460,7 +487,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, chan = sp->hdr.cid & RXRPC_CHANNELMASK; conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; - rcu_assign_pointer(conn->channels[chan].call, call); + conn->channels[chan].call = call; spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); @@ -520,7 +547,7 @@ static void rxrpc_cleanup_ring(struct rxrpc_call *call) void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; - bool put = false; + bool put = false, putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); @@ -555,7 +582,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - rxrpc_put_call(call, rxrpc_call_put_userid_exists); + putu = true; } list_del(&call->sock_link); @@ -563,6 +590,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + if (putu) + rxrpc_put_call(call, rxrpc_call_put_userid); + _leave(""); } diff --git a/net/rxrpc/call_state.c b/net/rxrpc/call_state.c index 27dc1242b712..6afb54373ebb 100644 --- a/net/rxrpc/call_state.c +++ b/net/rxrpc/call_state.c @@ -65,5 +65,5 @@ void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion comp call->completion = compl; call->_state = RXRPC_CALL_COMPLETE; trace_rxrpc_call_complete(call); - __set_bit(RXRPC_CALL_RELEASED, &call->flags); + WARN_ON_ONCE(__test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index ebb43f65ebc5..981ca5b98bcb 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -39,61 +39,19 @@ static void rxrpc_activate_bundle(struct rxrpc_bundle *bundle) atomic_inc(&bundle->active); } -/* - * Get a connection ID and epoch for a client connection from the global pool. - * The connection struct pointer is then recorded in the idr radix tree. The - * epoch doesn't change until the client is rebooted (or, at least, unless the - * module is unloaded). - */ -static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, - gfp_t gfp) -{ - struct rxrpc_local *local = conn->local; - int id; - - _enter(""); - - idr_preload(gfp); - spin_lock(&local->conn_lock); - - id = idr_alloc_cyclic(&local->conn_ids, conn, - 1, 0x40000000, GFP_NOWAIT); - if (id < 0) - goto error; - - spin_unlock(&local->conn_lock); - idr_preload_end(); - - conn->proto.epoch = local->rxnet->epoch; - conn->proto.cid = id << RXRPC_CIDSHIFT; - set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); - _leave(" [CID %x]", conn->proto.cid); - return 0; - -error: - spin_unlock(&local->conn_lock); - idr_preload_end(); - _leave(" = %d", id); - return id; -} - /* * Release a connection ID for a client connection. */ static void rxrpc_put_client_connection_id(struct rxrpc_local *local, struct rxrpc_connection *conn) { - if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { - spin_lock(&local->conn_lock); - idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); - spin_unlock(&local->conn_lock); - } + idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); } /* * Destroy the client connection ID tree. */ -void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) +static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) { struct rxrpc_connection *conn; int id; @@ -129,7 +87,6 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, bundle->security_level = call->security_level; refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); - spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); } @@ -169,69 +126,68 @@ void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) } } +/* + * Get rid of outstanding client connection preallocations when a local + * endpoint is destroyed. + */ +void rxrpc_purge_client_connections(struct rxrpc_local *local) +{ + rxrpc_destroy_client_conn_ids(local); +} + /* * Allocate a client connection. */ static struct rxrpc_connection * -rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) +rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = bundle->local->rxnet; - int ret; + struct rxrpc_local *local = bundle->local; + struct rxrpc_net *rxnet = local->rxnet; + int id; _enter(""); - conn = rxrpc_alloc_connection(rxnet, gfp); - if (!conn) { - _leave(" = -ENOMEM"); + conn = rxrpc_alloc_connection(rxnet, GFP_ATOMIC | __GFP_NOWARN); + if (!conn) return ERR_PTR(-ENOMEM); + + id = idr_alloc_cyclic(&local->conn_ids, conn, 1, 0x40000000, + GFP_ATOMIC | __GFP_NOWARN); + if (id < 0) { + kfree(conn); + return ERR_PTR(id); } refcount_set(&conn->ref, 1); - conn->bundle = bundle; - conn->local = bundle->local; - conn->peer = bundle->peer; - conn->key = bundle->key; + conn->proto.cid = id << RXRPC_CIDSHIFT; + conn->proto.epoch = local->rxnet->epoch; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); + conn->local = rxrpc_get_local(bundle->local, rxrpc_local_get_client_conn); + conn->peer = rxrpc_get_peer(bundle->peer, rxrpc_peer_get_client_conn); + conn->key = key_get(bundle->key); + conn->security = bundle->security; conn->exclusive = bundle->exclusive; conn->upgrade = bundle->upgrade; conn->orig_service_id = bundle->service_id; conn->security_level = bundle->security_level; - conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->state = RXRPC_CONN_CLIENT; + conn->state = RXRPC_CONN_CLIENT_UNSECURED; conn->service_id = conn->orig_service_id; - ret = rxrpc_get_client_connection_id(conn, gfp); - if (ret < 0) - goto error_0; - - ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) - goto error_1; + if (conn->security == &rxrpc_no_security) + conn->state = RXRPC_CONN_CLIENT; atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); - rxrpc_get_peer(conn->peer, rxrpc_peer_get_client_conn); - rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); - key_get(conn->key); - - trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), - rxrpc_conn_new_client); + rxrpc_see_connection(conn, rxrpc_conn_new_client); atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); - _leave(" = %p", conn); return conn; - -error_1: - rxrpc_put_client_connection_id(bundle->local, conn); -error_0: - kfree(conn); - _leave(" = %d", ret); - return ERR_PTR(ret); } /* @@ -249,7 +205,8 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; - if (conn->state != RXRPC_CONN_CLIENT || + if ((conn->state != RXRPC_CONN_CLIENT_UNSECURED && + conn->state != RXRPC_CONN_CLIENT) || conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; @@ -280,7 +237,7 @@ dont_reuse: * Look up the conn bundle that matches the connection parameters, adding it if * it doesn't yet exist. */ -static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) +int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle, *candidate; @@ -295,7 +252,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t if (test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags)) { call->bundle = rxrpc_alloc_bundle(call, gfp); - return call->bundle; + return call->bundle ? 0 : -ENOMEM; } /* First, see if the bundle is already there. */ @@ -324,7 +281,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t /* It wasn't. We need to add one. */ candidate = rxrpc_alloc_bundle(call, gfp); if (!candidate) - return ERR_PTR(-ENOMEM); + return -ENOMEM; _debug("search 2"); spin_lock(&local->client_bundles_lock); @@ -355,7 +312,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [new]", call->bundle->debug_id); - return call->bundle; + return 0; found_bundle_free: rxrpc_free_bundle(candidate); @@ -364,160 +321,77 @@ found_bundle: rxrpc_activate_bundle(bundle); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [found]", call->bundle->debug_id); - return call->bundle; -} - -/* - * Create or find a client bundle to use for a call. - * - * If we return with a connection, the call will be on its waiting list. It's - * left to the caller to assign a channel and wake up the call. - */ -static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_call *call, gfp_t gfp) -{ - struct rxrpc_bundle *bundle; - - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - - call->peer = rxrpc_lookup_peer(call->local, &call->dest_srx, gfp); - if (!call->peer) - goto error; - - call->tx_last_sent = ktime_get_real(); - call->cong_ssthresh = call->peer->cong_ssthresh; - if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - else - call->cong_mode = RXRPC_CALL_SLOW_START; - - /* Find the client connection bundle. */ - bundle = rxrpc_look_up_bundle(call, gfp); - if (!bundle) - goto error; - - /* Get this call queued. Someone else may activate it whilst we're - * lining up a new connection, but that's fine. - */ - spin_lock(&bundle->channel_lock); - list_add_tail(&call->chan_wait_link, &bundle->waiting_calls); - spin_unlock(&bundle->channel_lock); - - _leave(" = [B=%x]", bundle->debug_id); - return bundle; - -error: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + return 0; } /* * Allocate a new connection and add it into a bundle. */ -static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) - __releases(bundle->channel_lock) +static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, + unsigned int slot) { - struct rxrpc_connection *candidate = NULL, *old = NULL; - bool conflict; - int i; - - _enter(""); - - conflict = bundle->alloc_conn; - if (!conflict) - bundle->alloc_conn = true; - spin_unlock(&bundle->channel_lock); - if (conflict) { - _leave(" [conf]"); - return; - } - - candidate = rxrpc_alloc_client_connection(bundle, gfp); - - spin_lock(&bundle->channel_lock); - bundle->alloc_conn = false; + struct rxrpc_connection *conn, *old; + unsigned int shift = slot * RXRPC_MAXCALLS; + unsigned int i; - if (IS_ERR(candidate)) { - bundle->alloc_error = PTR_ERR(candidate); - spin_unlock(&bundle->channel_lock); - _leave(" [err %ld]", PTR_ERR(candidate)); - return; + old = bundle->conns[slot]; + if (old) { + bundle->conns[slot] = NULL; + trace_rxrpc_client(old, -1, rxrpc_client_replace); + rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } - bundle->alloc_error = 0; - - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { - unsigned int shift = i * RXRPC_MAXCALLS; - int j; - - old = bundle->conns[i]; - if (!rxrpc_may_reuse_conn(old)) { - if (old) - trace_rxrpc_client(old, -1, rxrpc_client_replace); - candidate->bundle_shift = shift; - rxrpc_activate_bundle(bundle); - bundle->conns[i] = candidate; - for (j = 0; j < RXRPC_MAXCALLS; j++) - set_bit(shift + j, &bundle->avail_chans); - candidate = NULL; - break; - } - - old = NULL; + conn = rxrpc_alloc_client_connection(bundle); + if (IS_ERR(conn)) { + bundle->alloc_error = PTR_ERR(conn); + return false; } - spin_unlock(&bundle->channel_lock); - - if (candidate) { - _debug("discard C=%x", candidate->debug_id); - trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); - rxrpc_put_connection(candidate, rxrpc_conn_put_discard); - } - - rxrpc_put_connection(old, rxrpc_conn_put_noreuse); - _leave(""); + rxrpc_activate_bundle(bundle); + conn->bundle_shift = shift; + bundle->conns[slot] = conn; + for (i = 0; i < RXRPC_MAXCALLS; i++) + set_bit(shift + i, &bundle->avail_chans); + return true; } /* * Add a connection to a bundle if there are no usable connections or we have * connections waiting for extra capacity. */ -static void rxrpc_maybe_add_conn(struct rxrpc_bundle *bundle, gfp_t gfp) +static bool rxrpc_bundle_has_space(struct rxrpc_bundle *bundle) { - struct rxrpc_call *call; - int i, usable; + int slot = -1, i, usable; _enter(""); - spin_lock(&bundle->channel_lock); + bundle->alloc_error = 0; /* See if there are any usable connections. */ usable = 0; - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) + for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { if (rxrpc_may_reuse_conn(bundle->conns[i])) usable++; - - if (!usable && !list_empty(&bundle->waiting_calls)) { - call = list_first_entry(&bundle->waiting_calls, - struct rxrpc_call, chan_wait_link); - if (test_bit(RXRPC_CALL_UPGRADE, &call->flags)) - bundle->try_upgrade = true; + else if (slot == -1) + slot = i; } + if (!usable && bundle->upgrade) + bundle->try_upgrade = true; + if (!usable) goto alloc_conn; if (!bundle->avail_chans && !bundle->try_upgrade && - !list_empty(&bundle->waiting_calls) && usable < ARRAY_SIZE(bundle->conns)) goto alloc_conn; - spin_unlock(&bundle->channel_lock); _leave(""); - return; + return usable; alloc_conn: - return rxrpc_add_conn_to_bundle(bundle, gfp); + return slot >= 0 ? rxrpc_add_conn_to_bundle(bundle, slot) : false; } /* @@ -531,11 +405,13 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_channel *chan = &conn->channels[channel]; struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_call *call = list_entry(bundle->waiting_calls.next, - struct rxrpc_call, chan_wait_link); + struct rxrpc_call, wait_link); u32 call_id = chan->call_counter + 1; _enter("C=%x,%u", conn->debug_id, channel); + list_del_init(&call->wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); /* Cancel the final ACK on the previous call if it hasn't been sent yet @@ -545,65 +421,50 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); rxrpc_see_call(call, rxrpc_call_see_activate_client); - list_del_init(&call->chan_wait_link); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; call->dest_srx.srx_service = conn->service_id; - - trace_rxrpc_connect_call(call); - - rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); - - /* Paired with the read barrier in rxrpc_connect_call(). This orders - * cid and epoch in the connection wrt to call_id without the need to - * take the channel_lock. - * - * We provisionally assign a callNumber at this point, but we don't - * confirm it until the call is about to be exposed. - * - * TODO: Pair with a barrier in the data_ready handler when that looks - * at the call ID through a connection channel. - */ - smp_wmb(); + call->cong_ssthresh = call->peer->cong_ssthresh; + if (call->cong_cwnd >= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + else + call->cong_mode = RXRPC_CALL_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; - rcu_assign_pointer(chan->call, call); + chan->call = call; + + rxrpc_see_call(call, rxrpc_call_see_connected); + trace_rxrpc_connect_call(call); + call->tx_last_sent = ktime_get_real(); + rxrpc_start_call_timer(call); + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); wake_up(&call->waitq); } /* * Remove a connection from the idle list if it's on it. */ -static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn) +static void rxrpc_unidle_conn(struct rxrpc_connection *conn) { - struct rxrpc_local *local = bundle->local; - bool drop_ref; - if (!list_empty(&conn->cache_link)) { - drop_ref = false; - spin_lock(&local->client_conn_cache_lock); - if (!list_empty(&conn->cache_link)) { - list_del_init(&conn->cache_link); - drop_ref = true; - } - spin_unlock(&local->client_conn_cache_lock); - if (drop_ref) - rxrpc_put_connection(conn, rxrpc_conn_put_unidle); + list_del_init(&conn->cache_link); + rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } } /* - * Assign channels and callNumbers to waiting calls with channel_lock - * held by caller. + * Assign channels and callNumbers to waiting calls. */ -static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) +static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; unsigned long avail, mask; unsigned int channel, slot; + trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); + if (bundle->try_upgrade) mask = 1; else @@ -623,7 +484,7 @@ static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) if (bundle->try_upgrade) set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); - rxrpc_unidle_conn(bundle, conn); + rxrpc_unidle_conn(conn); channel &= (RXRPC_MAXCALLS - 1); conn->act_chans |= 1 << channel; @@ -632,125 +493,24 @@ static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) } /* - * Assign channels and callNumbers to waiting calls. - */ -static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) -{ - _enter("B=%x", bundle->debug_id); - - trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); - - if (!bundle->avail_chans) - return; - - spin_lock(&bundle->channel_lock); - rxrpc_activate_channels_locked(bundle); - spin_unlock(&bundle->channel_lock); - _leave(""); -} - -/* - * Wait for a callNumber and a channel to be granted to a call. - */ -static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle, - struct rxrpc_call *call, gfp_t gfp) -{ - DECLARE_WAITQUEUE(myself, current); - int ret = 0; - - _enter("%d", call->debug_id); - - if (!gfpflags_allow_blocking(gfp)) { - rxrpc_maybe_add_conn(bundle, gfp); - rxrpc_activate_channels(bundle); - ret = bundle->alloc_error ?: -EAGAIN; - goto out; - } - - add_wait_queue_exclusive(&call->waitq, &myself); - for (;;) { - rxrpc_maybe_add_conn(bundle, gfp); - rxrpc_activate_channels(bundle); - ret = bundle->alloc_error; - if (ret < 0) - break; - - switch (call->interruptibility) { - case RXRPC_INTERRUPTIBLE: - case RXRPC_PREINTERRUPTIBLE: - set_current_state(TASK_INTERRUPTIBLE); - break; - case RXRPC_UNINTERRUPTIBLE: - default: - set_current_state(TASK_UNINTERRUPTIBLE); - break; - } - if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) - break; - if ((call->interruptibility == RXRPC_INTERRUPTIBLE || - call->interruptibility == RXRPC_PREINTERRUPTIBLE) && - signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - schedule(); - } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); - -out: - _leave(" = %d", ret); - return ret; -} - -/* - * find a connection for a call - * - called in process context with IRQs enabled + * Connect waiting channels (called from the I/O thread). */ -int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) +void rxrpc_connect_client_calls(struct rxrpc_local *local) { - struct rxrpc_bundle *bundle; - int ret = 0; - - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - - rxrpc_get_call(call, rxrpc_call_get_io_thread); - - bundle = rxrpc_prep_call(call, gfp); - if (IS_ERR(bundle)) { - rxrpc_put_call(call, rxrpc_call_get_io_thread); - ret = PTR_ERR(bundle); - goto out; - } - - if (rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_CONN) { - ret = rxrpc_wait_for_channel(bundle, call, gfp); - if (ret < 0) - goto wait_failed; - } - -granted_channel: - /* Paired with the write barrier in rxrpc_activate_one_channel(). */ - smp_rmb(); + struct rxrpc_call *call; -out: - _leave(" = %d", ret); - return ret; + while ((call = list_first_entry_or_null(&local->new_client_calls, + struct rxrpc_call, wait_link)) + ) { + struct rxrpc_bundle *bundle = call->bundle; -wait_failed: - spin_lock(&bundle->channel_lock); - list_del_init(&call->chan_wait_link); - spin_unlock(&bundle->channel_lock); + spin_lock(&local->client_call_lock); + list_move_tail(&call->wait_link, &bundle->waiting_calls); + spin_unlock(&local->client_call_lock); - if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) { - ret = 0; - goto granted_channel; + if (rxrpc_bundle_has_space(bundle)) + rxrpc_activate_channels(bundle); } - - trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed); - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); - rxrpc_disconnect_client_call(bundle, call); - goto out; } /* @@ -808,8 +568,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _enter("c=%x", call->debug_id); - spin_lock(&bundle->channel_lock); - /* Calls that have never actually been assigned a channel can simply be * discarded. */ @@ -818,8 +576,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); - list_del_init(&call->chan_wait_link); - goto out; + list_del_init(&call->wait_link); + return; } cid = call->cid; @@ -827,10 +585,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call chan = &conn->channels[channel]; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); - if (rcu_access_pointer(chan->call) != call) { - spin_unlock(&bundle->channel_lock); - BUG(); - } + if (WARN_ON(chan->call != call)) + return; may_reuse = rxrpc_may_reuse_conn(conn); @@ -851,16 +607,15 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call trace_rxrpc_client(conn, channel, rxrpc_client_to_active); bundle->try_upgrade = false; if (may_reuse) - rxrpc_activate_channels_locked(bundle); + rxrpc_activate_channels(bundle); } - } /* See if we can pass the channel directly to another call. */ if (may_reuse && !list_empty(&bundle->waiting_calls)) { trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); - goto out; + return; } /* Schedule the final ACK to be transmitted in a short while so that it @@ -878,7 +633,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call } /* Deactivate the channel. */ - rcu_assign_pointer(chan->call, NULL); + chan->call = NULL; set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans); conn->act_chans &= ~(1 << channel); @@ -891,15 +646,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call conn->idle_timestamp = jiffies; rxrpc_get_connection(conn, rxrpc_conn_get_idle); - spin_lock(&local->client_conn_cache_lock); list_move_tail(&conn->cache_link, &local->idle_client_conns); - spin_unlock(&local->client_conn_cache_lock); rxrpc_set_client_reap_timer(local); } - -out: - spin_unlock(&bundle->channel_lock); } /* @@ -909,7 +659,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; unsigned int bindex; - bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -917,18 +666,13 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, true); - spin_lock(&bundle->channel_lock); bindex = conn->bundle_shift / RXRPC_MAXCALLS; if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); - need_drop = true; - } - spin_unlock(&bundle->channel_lock); - - if (need_drop) { + rxrpc_put_client_connection_id(bundle->local, conn); rxrpc_deactivate_bundle(bundle); rxrpc_put_connection(conn, rxrpc_conn_put_unbundle); } @@ -990,24 +734,16 @@ void rxrpc_discard_expired_client_conns(struct rxrpc_local *local) _enter(""); - if (list_empty(&local->idle_client_conns)) { - _leave(" [empty]"); - return; - } - /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ nr_conns = atomic_read(&local->rxnet->nr_client_conns); next: - spin_lock(&local->client_conn_cache_lock); - - if (list_empty(&local->idle_client_conns)) - goto out; - - conn = list_entry(local->idle_client_conns.next, - struct rxrpc_connection, cache_link); + conn = list_first_entry_or_null(&local->idle_client_conns, + struct rxrpc_connection, cache_link); + if (!conn) + return; if (!local->kill_all_client_conns) { /* If the number of connections is over the reap limit, we @@ -1032,8 +768,6 @@ next: trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); - spin_unlock(&local->client_conn_cache_lock); - rxrpc_unbundle_conn(conn); /* Drop the ->cache_link ref */ rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle); @@ -1053,8 +787,6 @@ not_yet_expired: if (!local->kill_all_client_conns) timer_reduce(&local->client_conn_reap_timer, conn_expires_at); -out: - spin_unlock(&local->client_conn_cache_lock); _leave(""); } @@ -1063,34 +795,19 @@ out: */ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { - struct rxrpc_connection *conn, *tmp; - LIST_HEAD(graveyard); + struct rxrpc_connection *conn; _enter(""); - spin_lock(&local->client_conn_cache_lock); local->kill_all_client_conns = true; - spin_unlock(&local->client_conn_cache_lock); del_timer_sync(&local->client_conn_reap_timer); - spin_lock(&local->client_conn_cache_lock); - - list_for_each_entry_safe(conn, tmp, &local->idle_client_conns, - cache_link) { - if (conn->local == local) { - atomic_dec(&conn->active); - trace_rxrpc_client(conn, -1, rxrpc_client_discard); - list_move(&conn->cache_link, &graveyard); - } - } - - spin_unlock(&local->client_conn_cache_lock); - - while (!list_empty(&graveyard)) { - conn = list_entry(graveyard.next, - struct rxrpc_connection, cache_link); + while ((conn = list_first_entry_or_null(&local->idle_client_conns, + struct rxrpc_connection, cache_link))) { list_del_init(&conn->cache_link); + atomic_dec(&conn->active); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); rxrpc_unbundle_conn(conn); rxrpc_put_connection(conn, rxrpc_conn_put_local_dead); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8d0b9ff0a5e1..44414e724415 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -100,9 +100,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, /* If the last call got moved on whilst we were waiting to run, just * ignore this packet. */ - call_id = READ_ONCE(chan->last_call); - /* Sync with __rxrpc_disconnect_call() */ - smp_rmb(); + call_id = chan->last_call; if (skb && call_id != sp->hdr.callNumber) return; @@ -119,9 +117,12 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[2].iov_base = &ack_info; iov[2].iov_len = sizeof(ack_info); + serial = atomic_inc_return(&conn->serial); + pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(conn->proto.cid | channel); pkt.whdr.callNumber = htonl(call_id); + pkt.whdr.serial = htonl(serial); pkt.whdr.seq = 0; pkt.whdr.type = chan->last_type; pkt.whdr.flags = conn->out_clientflag; @@ -158,31 +159,15 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[0].iov_len += sizeof(pkt.ack); len += sizeof(pkt.ack) + 3 + sizeof(ack_info); ioc = 3; - break; - - default: - return; - } - - /* Resync with __rxrpc_disconnect_call() and check that the last call - * didn't get advanced whilst we were filling out the packets. - */ - smp_rmb(); - if (READ_ONCE(chan->last_call) != call_id) - return; - - serial = atomic_inc_return(&conn->serial); - pkt.whdr.serial = htonl(serial); - switch (chan->last_type) { - case RXRPC_PACKET_TYPE_ABORT: - break; - case RXRPC_PACKET_TYPE_ACK: trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), pkt.ack.reason, 0); break; + + default: + return; } ret = kernel_sendmsg(conn->local->socket, &msg, iov, ioc, len); @@ -207,12 +192,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) _enter("{%d},%x", conn->debug_id, conn->abort_code); - spin_lock(&conn->bundle->channel_lock); - for (i = 0; i < RXRPC_MAXCALLS; i++) { - call = rcu_dereference_protected( - conn->channels[i].call, - lockdep_is_held(&conn->bundle->channel_lock)); + call = conn->channels[i].call; if (call) rxrpc_set_call_completion(call, conn->completion, @@ -220,7 +201,6 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) conn->error); } - spin_unlock(&conn->bundle->channel_lock); _leave(""); } @@ -316,9 +296,7 @@ again: if (!test_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags)) continue; - smp_rmb(); /* vs rxrpc_disconnect_client_call */ - ack_at = READ_ONCE(chan->final_ack_at); - + ack_at = chan->final_ack_at; if (time_before(j, ack_at) && !force) { if (time_before(ack_at, next_j)) { next_j = ack_at; @@ -446,15 +424,8 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->state != RXRPC_CONN_SERVICE) break; - spin_lock(&conn->bundle->channel_lock); - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure( - rcu_dereference_protected( - conn->channels[loop].call, - lockdep_is_held(&conn->bundle->channel_lock))); - - spin_unlock(&conn->bundle->channel_lock); + rxrpc_call_is_secure(conn->channels[loop].call); break; } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3d8c1dc6a82a..ac85d4644a3c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -67,6 +67,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + mutex_init(&conn->security_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; @@ -157,7 +158,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, _enter("%d,%x", conn->debug_id, call->cid); - if (rcu_access_pointer(chan->call) == call) { + if (chan->call == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ @@ -177,12 +178,9 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, break; } - /* Sync with rxrpc_conn_retransmit(). */ - smp_wmb(); chan->last_call = chan->call_id; chan->call_id = chan->call_counter; - - rcu_assign_pointer(chan->call, NULL); + chan->call = NULL; } _leave(""); @@ -210,10 +208,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) if (rxrpc_is_client_call(call)) { rxrpc_disconnect_client_call(call->bundle, call); } else { - spin_lock(&conn->bundle->channel_lock); __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->bundle->channel_lock); - conn->idle_timestamp = jiffies; if (atomic_dec_and_test(&conn->active)) rxrpc_set_service_reap_timer(conn->rxnet, @@ -316,10 +311,10 @@ static void rxrpc_clean_up_connection(struct work_struct *work) container_of(work, struct rxrpc_connection, destructor); struct rxrpc_net *rxnet = conn->rxnet; - ASSERT(!rcu_access_pointer(conn->channels[0].call) && - !rcu_access_pointer(conn->channels[1].call) && - !rcu_access_pointer(conn->channels[2].call) && - !rcu_access_pointer(conn->channels[3].call)); + ASSERT(!conn->channels[0].call && + !conn->channels[1].call && + !conn->channels[2].call && + !conn->channels[3].call); ASSERT(list_empty(&conn->cache_link)); del_timer_sync(&conn->timer); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 2a55a88b2a5b..f30323de82bd 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -11,7 +11,6 @@ static struct rxrpc_bundle rxrpc_service_dummy_bundle = { .ref = REFCOUNT_INIT(1), .debug_id = UINT_MAX, - .channel_lock = __SPIN_LOCK_UNLOCKED(&rxrpc_service_dummy_bundle.channel_lock), }; /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index a299cc34c140..9e9dfb2fc559 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -369,10 +369,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, return just_discard; } - rcu_read_lock(); - call = rxrpc_try_get_call(rcu_dereference(chan->call), - rxrpc_call_get_input); - rcu_read_unlock(); + call = rxrpc_try_get_call(chan->call, rxrpc_call_get_input); if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { @@ -453,6 +450,9 @@ int rxrpc_io_thread(void *data) continue; } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -492,7 +492,10 @@ int rxrpc_io_thread(void *data) should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || !list_empty(&local->call_attend_q) || - !list_empty(&local->conn_attend_q)) { + !list_empty(&local->conn_attend_q) || + !list_empty(&local->new_client_calls) || + test_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) { __set_current_state(TASK_RUNNING); continue; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 9bc8d08ca12c..b8eaca5d9f22 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -117,7 +117,6 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); local->kill_all_client_conns = false; - spin_lock_init(&local->client_conn_cache_lock); INIT_LIST_HEAD(&local->idle_client_conns); timer_setup(&local->client_conn_reap_timer, rxrpc_client_conn_reap_timeout, 0); @@ -133,7 +132,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, if (tmp == 0) tmp = 1; idr_set_cursor(&local->conn_ids, tmp); - spin_lock_init(&local->conn_lock); + INIT_LIST_HEAD(&local->new_client_calls); + spin_lock_init(&local->client_call_lock); trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } @@ -435,7 +435,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local) * local endpoint. */ rxrpc_purge_queue(&local->rx_queue); - rxrpc_destroy_client_conn_ids(local); + rxrpc_purge_client_connections(local); } /* diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index c39ef94602ed..750158a085cd 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -12,6 +12,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT_UNSECURED] = "ClUnsec ", [RXRPC_CONN_CLIENT] = "Client ", [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index dfb01e7b90fb..1bf571a66e02 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1122,36 +1122,31 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, goto protocol_error_free; } - spin_lock(&conn->bundle->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { - struct rxrpc_call *call; u32 call_id = ntohl(response->encrypted.call_id[i]); + u32 counter = READ_ONCE(conn->channels[i].call_counter); if (call_id > INT_MAX) { rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, rxkad_abort_resp_bad_callid); - goto protocol_error_unlock; + goto protocol_error_free; } - if (call_id < conn->channels[i].call_counter) { + if (call_id < counter) { rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, rxkad_abort_resp_call_ctr); - goto protocol_error_unlock; + goto protocol_error_free; } - if (call_id > conn->channels[i].call_counter) { - call = rcu_dereference_protected( - conn->channels[i].call, - lockdep_is_held(&conn->bundle->channel_lock)); - if (call && !__rxrpc_call_is_complete(call)) { + if (call_id > counter) { + if (conn->channels[i].call) { rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, rxkad_abort_resp_call_state); - goto protocol_error_unlock; + goto protocol_error_free; } conn->channels[i].call_counter = call_id; } } - spin_unlock(&conn->bundle->channel_lock); if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) { rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO, @@ -1179,8 +1174,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _leave(" = 0"); return 0; -protocol_error_unlock: - spin_unlock(&conn->bundle->channel_lock); protocol_error_free: kfree(ticket); protocol_error: diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 78af14694618..cd66634dffe6 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -97,38 +97,31 @@ found: */ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { - const struct rxrpc_security *sec; struct rxrpc_key_token *token; struct key *key = conn->key; - int ret; + int ret = 0; _enter("{%d},{%x}", conn->debug_id, key_serial(key)); - if (!key) - return 0; - - ret = key_validate(key); - if (ret < 0) - return ret; - for (token = key->payload.data[0]; token; token = token->next) { - sec = rxrpc_security_lookup(token->security_index); - if (sec) + if (token->security_index == conn->security->security_index) goto found; } return -EKEYREJECTED; found: - conn->security = sec; - - ret = conn->security->init_connection_security(conn, token); - if (ret < 0) { - conn->security = &rxrpc_no_security; - return ret; + mutex_lock(&conn->security_lock); + if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { + ret = conn->security->init_connection_security(conn, token); + if (ret == 0) { + spin_lock(&conn->state_lock); + if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) + conn->state = RXRPC_CONN_CLIENT; + spin_unlock(&conn->state_lock); + } } - - _leave(" = 0"); - return 0; + mutex_unlock(&conn->security_lock); + return ret; } /* diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a5d0005b7ce5..da49fcf1c456 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -38,6 +38,60 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, return false; } +/* + * Wait for a call to become connected. Interruption here doesn't cause the + * call to be aborted. + */ +static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret = 0; + + _enter("%d", call->debug_id); + + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) + return call->error; + + add_wait_queue_exclusive(&call->waitq, &myself); + + for (;;) { + ret = call->error; + if (ret < 0) + break; + + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: + set_current_state(TASK_INTERRUPTIBLE); + break; + case RXRPC_UNINTERRUPTIBLE: + default: + set_current_state(TASK_UNINTERRUPTIBLE); + break; + } + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) { + ret = call->error; + break; + } + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && + signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + *timeo = schedule_timeout(*timeo); + } + + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); + + if (ret == 0 && rxrpc_call_is_complete(call)) + ret = call->error; + + _leave(" = %d", ret); + return ret; +} + /* * Return true if there's sufficient Tx queue space. */ @@ -239,6 +293,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + ret = rxrpc_wait_to_be_connected(call, &timeo); + if (ret < 0) + return ret; + + if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { + ret = rxrpc_init_client_conn_security(call->conn); + if (ret < 0) + return ret; + } + /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); -- cgit v1.2.3 From c244c092f1ed2acfb5af3d3da81e22367d3dd733 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 5 Jan 2023 06:02:51 +0000 Subject: tipc: fix unexpected link reset due to discovery messages This unexpected behavior is observed: node 1 | node 2 ------ | ------ link is established | link is established reboot | link is reset up | send discovery message receive discovery message | link is established | link is established send discovery message | | receive discovery message | link is reset (unexpected) | send reset message link is reset | It is due to delayed re-discovery as described in function tipc_node_check_dest(): "this link endpoint has already reset and re-established contact with the peer, before receiving a discovery message from that node." However, commit 598411d70f85 has changed the condition for calling tipc_node_link_down() which was the acceptance of new media address. This commit fixes this by restoring the old and correct behavior. Fixes: 598411d70f85 ("tipc: make resetting of links non-atomic") Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/node.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 49ddc484c4fe..5e000fde8067 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1179,8 +1179,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool addr_match = false; bool sign_match = false; bool link_up = false; + bool link_is_reset = false; bool accept_addr = false; - bool reset = true; + bool reset = false; char *if_name; unsigned long intv; u16 session; @@ -1200,14 +1201,14 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); + link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { - /* All is fine. Do nothing. */ - reset = false; + /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; @@ -1232,6 +1233,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, */ accept_addr = true; *respond = true; + reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already @@ -1263,6 +1265,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, n->signature = signature; accept_addr = true; *respond = true; + reset = true; } if (!accept_addr) @@ -1291,6 +1294,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); @@ -1303,7 +1307,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && l && !tipc_link_is_reset(l)) + if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } -- cgit v1.2.3 From 7827c81f0248e3c2f40d438b020f3d222f002171 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Jan 2023 12:43:37 -0500 Subject: Revert "SUNRPC: Use RMW bitops in single-threaded hot paths" The premise that "Once an svc thread is scheduled and executing an RPC, no other processes will touch svc_rqst::rq_flags" is false. svc_xprt_enqueue() examines the RQ_BUSY flag in scheduled nfsd threads when determining which thread to wake up next. Found via KCSAN. Fixes: 28df0988815f ("SUNRPC: Use RMW bitops in single-threaded hot paths") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 4 ++-- net/sunrpc/svc.c | 6 +++--- net/sunrpc/svc_xprt.c | 2 +- net/sunrpc/svcsock.c | 8 ++++---- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 148bb0a7fa5b..acb822b23af1 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -923,7 +923,7 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g * rejecting the server-computed MIC in this somewhat rare case, * do not use splice with the GSS integrity service. */ - __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) @@ -990,7 +990,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs int pad, remaining_len, offset; u32 rseqno; - __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 8f1b596db33f..b1da586c4476 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1243,10 +1243,10 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off by GSS integrity and privacy services */ - __set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ - __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - __clear_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + clear_bit(RQ_DROPME, &rqstp->rq_flags); svc_putu32(resv, rqstp->rq_xid); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2106003645a7..c2ce12538008 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1238,7 +1238,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; return &dr->handle; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2fc98fea59b4..e833103f4629 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -298,9 +298,9 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs) static void svc_sock_secure_port(struct svc_rqst *rqstp) { if (svc_port_is_privileged(svc_addr(rqstp))) - __set_bit(RQ_SECURE, &rqstp->rq_flags); + set_bit(RQ_SECURE, &rqstp->rq_flags); else - __clear_bit(RQ_SECURE, &rqstp->rq_flags); + clear_bit(RQ_SECURE, &rqstp->rq_flags); } /* @@ -1008,9 +1008,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags)) - __set_bit(RQ_LOCAL, &rqstp->rq_flags); + set_bit(RQ_LOCAL, &rqstp->rq_flags); else - __clear_bit(RQ_LOCAL, &rqstp->rq_flags); + clear_bit(RQ_LOCAL, &rqstp->rq_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 199fa012f18a..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -602,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - __set_bit(RQ_SECURE, &rqstp->rq_flags); + set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) -- cgit v1.2.3 From 42f229c350f57a8e825f7591e17cbc5c87e50235 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2023 13:03:18 +0000 Subject: rxrpc: Fix incoming call setup race An incoming call can race with rxrpc socket destruction, leading to a leaked call. This may result in an oops when the call timer eventually expires: BUG: kernel NULL pointer dereference, address: 0000000000000874 RIP: 0010:_raw_spin_lock_irqsave+0x2a/0x50 Call Trace: try_to_wake_up+0x59/0x550 ? __local_bh_enable_ip+0x37/0x80 ? rxrpc_poke_call+0x52/0x110 [rxrpc] ? rxrpc_poke_call+0x110/0x110 [rxrpc] ? rxrpc_poke_call+0x110/0x110 [rxrpc] call_timer_fn+0x24/0x120 with a warning in the kernel log looking something like: rxrpc: Call 00000000ba5e571a still in use (1,SvAwtACK,1061d,0)! incurred during rmmod of rxrpc. The 1061d is the call flags: RECVMSG_READ_ALL, RX_HEARD, BEGAN_RX_TIMER, RX_LAST, EXPOSED, IS_SERVICE, RELEASED but no DISCONNECTED flag (0x800), so it's an incoming (service) call and it's still connected. The race appears to be that: (1) rxrpc_new_incoming_call() consults the service struct, checks sk_state and allocates a call - then pauses, possibly for an interrupt. (2) rxrpc_release_sock() sets RXRPC_CLOSE, nulls the service pointer, discards the prealloc and releases all calls attached to the socket. (3) rxrpc_new_incoming_call() resumes, launching the new call, including its timer and attaching it to the socket. Fix this by read-locking local->services_lock to access the AF_RXRPC socket providing the service rather than RCU in rxrpc_new_incoming_call(). There's no real need to use RCU here as local->services_lock is only write-locked by the socket side in two places: when binding and when shutting down. Fixes: 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") Reported-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 8 ++++---- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_accept.c | 14 +++++++------- net/rxrpc/security.c | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index cf200e4e0eae..ebbd4a1c3f86 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -155,10 +155,10 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) if (service_id) { write_lock(&local->services_lock); - if (rcu_access_pointer(local->service)) + if (local->service) goto service_in_use; rx->local = local; - rcu_assign_pointer(local->service, rx); + local->service = rx; write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; @@ -875,9 +875,9 @@ static int rxrpc_release_sock(struct sock *sk) sk->sk_state = RXRPC_CLOSE; - if (rx->local && rcu_access_pointer(rx->local->service) == rx) { + if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); - rcu_assign_pointer(rx->local->service, NULL); + rx->local->service = NULL; write_unlock(&rx->local->services_lock); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 007258538bee..433060cade03 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -283,7 +283,7 @@ struct rxrpc_local { struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ - struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ + struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head rx_queue; /* Received packets */ struct list_head conn_attend_q; /* Conns requiring immediate attention */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3fbf2fcaaf9e..3e8689fdc437 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -343,13 +343,13 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - rcu_read_lock(); + read_lock(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just * discarded. */ - rx = rcu_dereference(local->service); + rx = local->service; if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && sp->hdr.serviceId != rx->second_service) ) { @@ -399,7 +399,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - rcu_read_unlock(); + read_unlock(&local->services_lock); if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); @@ -413,20 +413,20 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, return true; unsupported_service: - rcu_read_unlock(); + read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - rcu_read_unlock(); + read_unlock(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - rcu_read_unlock(); + read_unlock(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - rcu_read_unlock(); + read_unlock(&local->services_lock); return true; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cd66634dffe6..cb8dd1d3b1d4 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -178,9 +178,9 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - rcu_read_lock(); + read_lock(&conn->local->services_lock); - rx = rcu_dereference(conn->local->service); + rx = conn->local->service; if (!rx) goto out; @@ -202,6 +202,6 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, } out: - rcu_read_unlock(); + read_unlock(&conn->local->services_lock); return key; } -- cgit v1.2.3 From 7871f54e3deed68a27111dda162c4fe9b9c65f8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Jan 2023 14:25:23 +0000 Subject: gro: take care of DODGY packets Jaroslav reported a recent throughput regression with virtio_net caused by blamed commit. It is unclear if DODGY GSO packets coming from user space can be accepted by GRO engine in the future with minimal changes, and if there is any expected gain from it. In the meantime, make sure to detect and flush DODGY packets. Fixes: 5eddb24901ee ("gro: add support of (hw)gro packets to gro stack") Signed-off-by: Eric Dumazet Reported-and-bisected-by: Jaroslav Pulchart Cc: Coco Li Cc: Paolo Abeni Signed-off-by: David S. Miller --- net/core/gro.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/gro.c b/net/core/gro.c index fd8c6a7e8d3e..506f83d715f8 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -505,8 +505,9 @@ found_ptype: NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; - /* Only support TCP at the moment. */ - if (!skb_is_gso_tcp(skb)) + /* Only support TCP and non DODGY users. */ + if (!skb_is_gso_tcp(skb) || + (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) NAPI_GRO_CB(skb)->flush = 1; } -- cgit v1.2.3 From 9e17f99220d111ea031b44153fdfe364b0024ff2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 7 Jan 2023 19:10:04 +0200 Subject: net/sched: act_mpls: Fix warning during failed attribute validation The 'TCA_MPLS_LABEL' attribute is of 'NLA_U32' type, but has a validation type of 'NLA_VALIDATE_FUNCTION'. This is an invalid combination according to the comment above 'struct nla_policy': " Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: NLA_BINARY Validation function called for the attribute. All other Unused - but note that it's a union " This can trigger the warning [1] in nla_get_range_unsigned() when validation of the attribute fails. Despite being of 'NLA_U32' type, the associated 'min'/'max' fields in the policy are negative as they are aliased by the 'validate' field. Fix by changing the attribute type to 'NLA_BINARY' which is consistent with the above comment and all other users of NLA_POLICY_VALIDATE_FN(). As a result, move the length validation to the validation function. No regressions in MPLS tests: # ./tdc.py -f tc-tests/actions/mpls.json [...] # echo $? 0 [1] WARNING: CPU: 0 PID: 17743 at lib/nlattr.c:118 nla_get_range_unsigned+0x1d8/0x1e0 lib/nlattr.c:117 Modules linked in: CPU: 0 PID: 17743 Comm: syz-executor.0 Not tainted 6.1.0-rc8 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014 RIP: 0010:nla_get_range_unsigned+0x1d8/0x1e0 lib/nlattr.c:117 [...] Call Trace: __netlink_policy_dump_write_attr+0x23d/0x990 net/netlink/policy.c:310 netlink_policy_dump_write_attr+0x22/0x30 net/netlink/policy.c:411 netlink_ack_tlv_fill net/netlink/af_netlink.c:2454 [inline] netlink_ack+0x546/0x760 net/netlink/af_netlink.c:2506 netlink_rcv_skb+0x1b7/0x240 net/netlink/af_netlink.c:2546 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:6109 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x5e9/0x6b0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x739/0x860 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] ____sys_sendmsg+0x38f/0x500 net/socket.c:2482 ___sys_sendmsg net/socket.c:2536 [inline] __sys_sendmsg+0x197/0x230 net/socket.c:2565 __do_sys_sendmsg net/socket.c:2574 [inline] __se_sys_sendmsg net/socket.c:2572 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2572 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lore.kernel.org/netdev/CAO4mrfdmjvRUNbDyP0R03_DrD_eFCLCguz6OxZ2TYRSv0K9gxA@mail.gmail.com/ Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Reported-by: Wei Chen Tested-by: Wei Chen Signed-off-by: Ido Schimmel Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230107171004.608436-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/act_mpls.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index ff47ce4d3968..6b26bdb999d7 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -134,6 +134,11 @@ static int valid_label(const struct nlattr *attr, { const u32 *label = nla_data(attr); + if (nla_len(attr) != sizeof(*label)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length"); + return -EINVAL; + } + if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); return -EINVAL; @@ -145,7 +150,8 @@ static int valid_label(const struct nlattr *attr, static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, - [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), + [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + valid_label), [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), -- cgit v1.2.3 From 96398560f26aa07e8f2969d73c8197e6a6d10407 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 9 Jan 2023 10:39:06 -0600 Subject: net: sched: disallow noqueue for qdisc classes While experimenting with applying noqueue to a classful queue discipline, we discovered a NULL pointer dereference in the __dev_queue_xmit() path that generates a kernel OOPS: # dev=enp0s5 # tc qdisc replace dev $dev root handle 1: htb default 1 # tc class add dev $dev parent 1: classid 1:1 htb rate 10mbit # tc qdisc add dev $dev parent 1:1 handle 10: noqueue # ping -I $dev -w 1 -c 1 1.1.1.1 [ 2.172856] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 2.173217] #PF: supervisor instruction fetch in kernel mode ... [ 2.178451] Call Trace: [ 2.178577] [ 2.178686] htb_enqueue+0x1c8/0x370 [ 2.178880] dev_qdisc_enqueue+0x15/0x90 [ 2.179093] __dev_queue_xmit+0x798/0xd00 [ 2.179305] ? _raw_write_lock_bh+0xe/0x30 [ 2.179522] ? __local_bh_enable_ip+0x32/0x70 [ 2.179759] ? ___neigh_create+0x610/0x840 [ 2.179968] ? eth_header+0x21/0xc0 [ 2.180144] ip_finish_output2+0x15e/0x4f0 [ 2.180348] ? dst_output+0x30/0x30 [ 2.180525] ip_push_pending_frames+0x9d/0xb0 [ 2.180739] raw_sendmsg+0x601/0xcb0 [ 2.180916] ? _raw_spin_trylock+0xe/0x50 [ 2.181112] ? _raw_spin_unlock_irqrestore+0x16/0x30 [ 2.181354] ? get_page_from_freelist+0xcd6/0xdf0 [ 2.181594] ? sock_sendmsg+0x56/0x60 [ 2.181781] sock_sendmsg+0x56/0x60 [ 2.181958] __sys_sendto+0xf7/0x160 [ 2.182139] ? handle_mm_fault+0x6e/0x1d0 [ 2.182366] ? do_user_addr_fault+0x1e1/0x660 [ 2.182627] __x64_sys_sendto+0x1b/0x30 [ 2.182881] do_syscall_64+0x38/0x90 [ 2.183085] entry_SYSCALL_64_after_hwframe+0x63/0xcd ... [ 2.187402] Previously in commit d66d6c3152e8 ("net: sched: register noqueue qdisc"), NULL was set for the noqueue discipline on noqueue init so that __dev_queue_xmit() falls through for the noqueue case. This also sets a bypass of the enqueue NULL check in the register_qdisc() function for the struct noqueue_disc_ops. Classful queue disciplines make it past the NULL check in __dev_queue_xmit() because the discipline is set to htb (in this case), and then in the call to __dev_xmit_skb(), it calls into htb_enqueue() which grabs a leaf node for a class and then calls qdisc_enqueue() by passing in a queue discipline which assumes ->enqueue() is not set to NULL. Fix this by not allowing classes to be assigned to the noqueue discipline. Linux TC Notes states that classes cannot be set to the noqueue discipline. [1] Let's enforce that here. Links: 1. https://linux-tc-notes.sourceforge.net/tc/doc/sch_noqueue.txt Fixes: d66d6c3152e8 ("net: sched: register noqueue qdisc") Cc: stable@vger.kernel.org Signed-off-by: Frederick Lawler Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20230109163906.706000-1-fred@cloudflare.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2317db02c764..72d2c204d5f3 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1133,6 +1133,11 @@ skip: return -ENOENT; } + if (new && new->ops == &noqueue_qdisc_ops) { + NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); + return -EINVAL; + } + err = cops->graft(parent, cl, new, &old, extack); if (err) return err; -- cgit v1.2.3 From cb3e9864cdbe35ff6378966660edbcbac955fe17 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2023 08:59:06 +0800 Subject: ipv6: raw: Deduct extension header length in rawv6_push_pending_frames The total cork length created by ip6_append_data includes extension headers, so we must exclude them when comparing them against the IPV6_CHECKSUM offset which does not include extension headers. Reported-by: Kyle Zeng Fixes: 357b40a18b04 ("[IPV6]: IPV6_CHECKSUM socket option can corrupt kernel memory") Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/raw.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a06a9f847db5..ada087b50541 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -505,6 +505,7 @@ csum_copy_err: static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { + struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; @@ -522,6 +523,9 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; + opt = inet6_sk(sk)->cork.opt; + total_len -= opt ? opt->opt_flen : 0; + if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); -- cgit v1.2.3