From 6f18dc893981e4daab29221d6a9771f3ce2dd8c5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Nov 2015 09:44:33 -0500 Subject: svcrdma: Do not send XDR roundup bytes for a write chunk Minor optimization: when dealing with write chunk XDR roundup, do not post a Write WR for the zero bytes in the pad. Simply update the write segment in the RPC-over-RDMA header to reflect the extra pad bytes. The Reply chunk is also a write chunk, but the server does not use send_write_chunks() to send the Reply chunk. That's OK in this case: the server Upper Layer typically marshals the Reply chunk contents in a single contiguous buffer, without a separate tail for the XDR pad. The comments and the variable naming refer to "chunks" but what is really meant is "segments." The existing code sends only one xdr_write_chunk per RPC reply. The fix assumes this as well. When the XDR pad in the first write chunk is reached, the assumption is the Write list is complete and send_write_chunks() returns. That will remain a valid assumption until the server Upper Layer can support multiple bulk payload results per RPC. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..bad5eaa9f812 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -342,6 +342,13 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, arg_ch->rs_handle, arg_ch->rs_offset, write_len); + + /* Do not send XDR pad bytes */ + if (chunk_no && write_len < 4) { + chunk_no++; + break; + } + chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, -- cgit v1.2.3 From 6496500cf15f29ac8afc565e2e4b6f92a1324860 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Nov 2015 15:45:35 -0500 Subject: svcrpc: move some initialization to common code Minor cleanup, no change in behavior. Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svcauth.c | 2 ++ net/sunrpc/svcauth_unix.c | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 79c0f3459b5c..69841db1f533 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; + init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; return aops->accept(rqstp, authp); @@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate); int svc_set_client(struct svc_rqst *rqstp) { + rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 621ca7b4a155..dfacdc95b3f5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct kvec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if (argv->iov_len < 3*4) return SVC_GARBAGE; @@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) u32 slen, i; int len = argv->iov_len; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if ((len -= 3*4) < 0) return SVC_GARBAGE; -- cgit v1.2.3 From 414ca017a54d26c3a58ed1504884e51448d22ae1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Nov 2015 10:48:02 -0500 Subject: nfsd4: fix gss-proxy 4.1 mounts for some AD principals The principal name on a gss cred is used to setup the NFSv4.0 callback, which has to have a client principal name to authenticate to. That code wants the name to be in the form servicetype@hostname. rpc.svcgssd passes down such names (and passes down no principal name at all in the case the principal isn't a service principal). gss-proxy always passes down the principal name, and passes it down in the form servicetype/hostname@REALM. So we've been munging the name gss-proxy passes down into the format the NFSv4.0 callback code expects, or throwing away the name if we can't. Since the introduction of the MACH_CRED enforcement in NFSv4.1, we've also been using the principal name to verify that certain operations are done as the same principal as was used on the original EXCHANGE_ID call. For that application, the original name passed down by gss-proxy is also useful. Lack of that name in some cases was causing some kerberized NFSv4.1 mount failures in an Active Directory environment. This fix only works in the gss-proxy case. The fix for legacy rpc.svcgssd would be more involved, and rpc.svcgssd already has other problems in the AD case. Reported-and-tested-by: James Ralston Acked-by: Simo Sorce Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/gss_rpc_upcall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 59eeed43eda2..f0c6a8c78a56 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net, if (data->found_creds && client_name.data != NULL) { char *c; + data->creds.cr_raw_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + data->creds.cr_principal = kstrndup(client_name.data, client_name.len, GFP_KERNEL); if (data->creds.cr_principal) { -- cgit v1.2.3 From 38b95bcf122545db7035a06d79ec9e851be2e011 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Nov 2015 11:37:08 +0300 Subject: xprtrdma: clean up some curly braces It doesn't matter either way, but the curly braces were clearly intended here. It causes a Smatch warning. Signed-off-by: Dan Carpenter Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655145a..2cc101410a76 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -852,10 +852,11 @@ retry: if (extras) { rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) + if (rc) { pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", __func__, rc); rc = 0; + } } } -- cgit v1.2.3 From abfb689711aaebd14d893236c6ea4bcdfb61e74c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Nov 2015 11:39:52 +0300 Subject: xprtrdma: checking for NULL instead of IS_ERR() The rpcrdma_create_req() function returns error pointers or success. It never returns NULL. Fixes: f531a5dbc451 ('xprtrdma: Pre-allocate backward rpc_rqst and send/receive buffers') Signed-off-by: Dan Carpenter Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f69e53..97554ca68191 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); -- cgit v1.2.3 From 9b06688bc3b9f13f8de90f832c455fddec3d4e8a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:06 -0500 Subject: xprtrdma: Fix additional uses of spin_lock_irqsave(rb_lock) Clean up. rb_lock critical sections added in rpcrdma_ep_post_extra_recv() should have first been converted to use normal spin_lock now that the reply handler is a work queue. The backchannel set up code should use the appropriate helper instead of open-coding a rb_recv_bufs list add. Problem introduced by glib patch re-ordering on my part. 
Fixes: f531a5dbc451 ('xprtrdma: Pre-allocate backward rpc_rqst') Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 6 +----- net/sunrpc/xprtrdma/verbs.c | 7 +++---- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 97554ca68191..40f48c62f9b1 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -84,9 +84,7 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { @@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, break; } - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + rpcrdma_recv_buffer_put(rep); } return rc; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2cc101410a76..003630733ef3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1338,15 +1338,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1356,7 +1355,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; -- cgit v1.2.3 From ffc4d9b1596c34caa98962722e930e97912c8a9f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:14 -0500 Subject: xprtrdma: xprt_rdma_free() must not release backchannel reqs Preserve any rpcrdma_req that is attached to rpc_rqst's allocated for the backchannel. Otherwise, after all the pre-allocated backchannel req's are consumed, incoming backward calls start writing on freed memory. Somehow this hunk got lost. Fixes: f531a5dbc451 ('xprtrdma: Pre-allocate backward rpc_rqst') Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..740bddcf3488 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); -- cgit v1.2.3 From c8bbe0c7fec3a6fd01d445eea11e72e902403ea9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:23 -0500 Subject: xprtrdma: Disable RPC/RDMA backchannel debugging messages Clean up. 
Fixes: 63cae47005af ('xprtrdma: Handle incoming backward direction') Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 40f48c62f9b1..cc1251d07297 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -15,7 +15,7 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define RPCRDMA_BACKCHANNEL_DEBUG +#undef RPCRDMA_BACKCHANNEL_DEBUG static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) @@ -136,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) __func__); goto out_free; } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); @@ -216,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpclen = rqst->rq_svec[0].iov_len; +#ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); pr_info("RPC: %s: RPC/RDMA: %*ph\n", __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); pr_info("RPC: %s: RPC: %*ph\n", __func__, (int)rpclen, rqst->rq_svec[0].iov_base); +#endif req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; @@ -265,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -329,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: using rqst %p\n", __func__, rqst); -#endif + dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -351,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * direction reply. */ req = rpcr_to_rdmar(rqst); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: attaching rep %p to req %p\n", + dprintk("RPC: %s: attaching rep %p to req %p\n", __func__, rep, req); -#endif req->rl_reply = rep; /* Defeat the retransmit detection logic in send_request */ -- cgit v1.2.3 From 3cf4e169be95e1a3a70a063b6bd8103fbd5911f3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:31 -0500 Subject: xprtrdma: Move struct ib_send_wr off the stack For FRWR FASTREG and LOCAL_INV, move the ib_*_wr structure off the stack. This allows frwr_op_map and frwr_op_unmap to chain WRs together without limit to register or invalidate a set of MRs with a single ib_post_send(). (This will be for chaining LOCAL_INV requests). 
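To illustrate the chaining this enables (a sketch only, not part of the patch: it anticipates the frwr_op_unmap_sync code added later in this series, uses the fr_invwr field this patch adds to struct rpcrdma_frmr, and the per-request "mws" list is a hypothetical stand-in for walking req->rl_segments):

static int frwr_post_linv_chain(struct rpcrdma_ia *ia, struct list_head *mws)
{
	struct ib_send_wr *first = NULL, *prev = NULL, *bad_wr;
	struct rpcrdma_mw *mw;

	list_for_each_entry(mw, mws, mw_list) {
		struct ib_send_wr *wr = &mw->r.frmr.fr_invwr;

		memset(wr, 0, sizeof(*wr));
		wr->opcode = IB_WR_LOCAL_INV;
		wr->wr_id = (uintptr_t)mw;
		wr->ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;

		if (!first)
			first = wr;		/* head of the chain */
		else
			prev->next = wr;	/* link onto the previous WR */
		prev = wr;
	}

	/* One doorbell invalidates every MR in the chain */
	return first ? ib_post_send(ia->ri_id->qp, first, &bad_wr) : 0;
}

Because the WRs now live inside the rpcrdma_frmr rather than on the stack, they stay valid for the lifetime of the posted chain.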
Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 38 ++++++++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++++ 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 88cf9e7269c2..31a45786137b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -319,7 +319,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; - struct ib_reg_wr reg_wr; + struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n, dma_nents; u8 key; @@ -336,6 +336,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; mr = frmr->fr_mr; + reg_wr = &frmr->fr_regwr; if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; @@ -381,19 +382,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = (uintptr_t)mw; - reg_wr.wr.num_sge = 0; - reg_wr.wr.send_flags = 0; - reg_wr.mr = mr; - reg_wr.key = mr->rkey; - reg_wr.access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; + reg_wr->wr.next = NULL; + reg_wr->wr.opcode = IB_WR_REG_MR; + reg_wr->wr.wr_id = (uintptr_t)mw; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = 0; + reg_wr->mr = mr; + reg_wr->key = mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, ®_wr.wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -423,23 +424,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_send_wr invalidate_wr, *bad_wr; + struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; + invalidate_wr = &mw->r.frmr.fr_invwr; - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (uintptr_t)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4f632a..5c1e0c600faf 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -207,6 +207,10 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; }; struct rpcrdma_fmr { -- cgit v1.2.3 From 
32d0ceecdfd0e941c492390fe5b6237cc1cf9fa6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:39 -0500 Subject: xprtrdma: Introduce ro_unmap_sync method In the current xprtrdma implementation, some memreg strategies implement ro_unmap synchronously (the MR is knocked down before the method returns) and some asynchonously (the MR will be knocked down and returned to the pool in the background). To guarantee the MR is truly invalid before the RPC consumer is allowed to resume execution, we need an unmap method that is always synchronous, invoked from the RPC/RDMA reply handler. The new method unmaps all MRs for an RPC. The existing ro_unmap method unmaps only one MR at a time. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5c1e0c600faf..c32cba3f21fb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -368,6 +368,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); int (*ro_open)(struct rpcrdma_ia *, -- cgit v1.2.3 From c9918ff56dfb175ce427140c641280d0b4522dbe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:47 -0500 Subject: xprtrdma: Add ro_unmap_sync method for FRWR FRWR's ro_unmap is asynchronous. The new ro_unmap_sync posts LOCAL_INV Work Requests and waits for them to complete before returning. Note also, DMA unmapping is now done _after_ invalidation. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 136 ++++++++++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 + 2 files changed, 134 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 31a45786137b..c6836844bd0e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -245,12 +245,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs + * to be reset. 
+ * + * WARNING: Only wr_id and status are reliable at this point + */ static void -frwr_sendcompletion(struct ib_wc *wc) +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) { - struct rpcrdma_mw *r; - if (likely(wc->status == IB_WC_SUCCESS)) return; @@ -261,9 +263,23 @@ frwr_sendcompletion(struct ib_wc *wc) else pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; } +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + struct rpcrdma_frmr *f = &r->r.frmr; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + __frwr_sendcompletion_flush(wc, r); + + if (f->fr_waiter) + complete(&f->fr_linv_done); +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -335,6 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; + frmr->fr_waiter = false; mr = frmr->fr_mr; reg_wr = &frmr->fr_regwr; @@ -414,6 +431,116 @@ out_senderr: return rc; } +static struct ib_send_wr * +__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + struct ib_send_wr *invalidate_wr; + + f->fr_waiter = false; + f->fr_state = FRMR_IS_INVALID; + invalidate_wr = &f->fr_invwr; + + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (unsigned long)(void *)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; + + return invalidate_wr; +} + +static void +__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int rc) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + + seg->rl_mw = NULL; + + ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + + if (!rc) + rpcrdma_put_mw(r_xprt, mw); + else + __frwr_queue_recovery(mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_frmr *f; + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + invalidate_wrs = pos = prev = NULL; + seg = NULL; + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + pos = __frwr_prepare_linv_wr(seg); + + if (!invalidate_wrs) + invalidate_wrs = pos; + else + prev->next = pos; + prev = pos; + + i += seg->mr_nsegs; + } + f = &seg->rl_mw->r.frmr; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + f->fr_invwr.send_flags = IB_SEND_SIGNALED; + f->fr_waiter = true; + init_completion(&f->fr_linv_done); + INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. 
+ */ + rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + if (rc) + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + wait_for_completion(&f->fr_linv_done); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __frwr_dma_unmap(r_xprt, seg, rc); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. */ @@ -473,6 +600,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c32cba3f21fb..ddae4909982b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -207,6 +207,8 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + bool fr_waiter; + struct completion fr_linv_done;; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; -- cgit v1.2.3 From 7c7a5390dc6c8d89fc368424b69a4eef8e43f411 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:22:55 -0500 Subject: xprtrdma: Add ro_unmap_sync method for FMR FMR's ro_unmap method is already synchronous because ib_unmap_fmr() is a synchronous verb. However, some improvements can be made here. 1. Gather all the MRs for the RPC request onto a list, and invoke ib_unmap_fmr() once with that list. This reduces the number of doorbells when there is more than one MR to invalidate 2. Perform the DMA unmap _after_ the MRs are unmapped, not before. This is critical after invalidating a Write chunk. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8dafbd507..c14f3a4bff68 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -179,6 +179,69 @@ out_maperr: return rc; } +static void +__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + int nsegs = seg->mr_nsegs; + + seg->rl_mw = NULL; + + while (nsegs--) + rpcrdma_unmap_one(device, seg++); + + rpcrdma_put_mw(r_xprt, mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_mw *mw; + LIST_HEAD(unmap_list); + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped MR. 
+ */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + list_add(&mw->r.fmr.fmr->list, &unmap_list); + + i += seg->mr_nsegs; + } + rc = ib_unmap_fmr(&unmap_list); + if (rc) + pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __fmr_dma_unmap(r_xprt, seg); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, -- cgit v1.2.3 From 73eee9b2de1fa08f2a82bb32ac4ec5e605716a91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:23:03 -0500 Subject: xprtrdma: Add ro_unmap_sync method for all-physical registration physical's ro_unmap is synchronous already. The new ro_unmap_sync method just has to DMA unmap all MRs associated with the RPC request. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/physical_ops.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 617b76f22154..dbb302ecf590 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +/* DMA unmap all memory regions that were mapped for "req". + */ +static void +physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int i; + + for (i = 0; req->rl_nchunks; --req->rl_nchunks) + rpcrdma_unmap_one(device, &req->rl_segments[i++]); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, -- cgit v1.2.3 From 68791649a725ac58c88b472ea6187853e67b3415 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:23:11 -0500 Subject: xprtrdma: Invalidate in the RPC reply handler There is a window between the time the RPC reply handler wakes the waiting RPC task and when xprt_release() invokes ops->buf_free. During this time, memory regions containing the data payload may still be accessed by a broken or malicious server, but the RPC application has already been allowed access to the memory containing the RPC request's data payloads. The server should be fenced from client memory containing RPC data payloads _before_ the RPC application is allowed to continue. This change also more strongly enforces send queue accounting. There is a maximum number of RPC calls allowed to be outstanding. When an RPC/RDMA transport is set up, just enough send queue resources are allocated to handle registration, Send, and invalidation WRs for each those RPCs at the same time. 
Before, additional RPC calls could be dispatched while invalidation WRs were still consuming send WQEs. When invalidation WRs backed up, dispatching additional RPCs resulted in a send queue overrun. Now, the reply handler prevents RPC dispatch until invalidation is complete. This prevents RPC call dispatch until there are enough send queue resources to proceed. Still to do: If an RPC exits early (say, ^C), the reply handler has no opportunity to perform invalidation. Currently, xprt_rdma_free() still frees remaining RDMA resources, which could deadlock. Additional changes are needed to handle invalidation properly in this case. Reported-by: Jason Gunthorpe Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c10d9699441c..0f28f2d743ed 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (req->rl_reply) goto out_duplicate; + /* Sanity checking has passed. We are now committed + * to complete this transaction. + */ + list_del_init(&rqst->rq_list); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", __func__, rep, req, rqst, @@ -888,12 +893,23 @@ badheader: break; } + /* Invalidate and flush the data payloads before waking the + * waiting application. This guarantees the memory region is + * properly fenced from the server before the application + * accesses the data. It also ensures proper send flow + * control: waking the next RPC waits until this RPC has + * relinquished all its Send Queue entries. + */ + if (req->rl_nchunks) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); + credits = be32_to_cpu(headerp->rm_credit); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_max_requests) credits = r_xprt->rx_buf.rb_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) -- cgit v1.2.3 From 26ae9d1c5af1b1d669ca1c28fc02bbca3d778d45 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Dec 2015 17:23:20 -0500 Subject: xprtrdma: Revert commit e7104a2a9606 ('xprtrdma: Cap req_cqinit'). The root of the problem was that sends (especially unsignalled FASTREG and LOCAL_INV Work Requests) were not properly flow- controlled, which allowed a send queue overrun. Now that the RPC/RDMA reply handler waits for invalidation to complete, the send queue is properly flow-controlled. Thus this limit is no longer necessary. 
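For context on what the cap was limiting: rep_cqinit is simply the number of Sends that may be posted unsignalled before one must be signalled. Roughly, the posting side decides like this (a reconstruction for illustration, not part of this patch; INIT_CQCOUNT and DECR_CQCOUNT are the helpers visible in the diff below):

	/* Request a Send completion only every rep_cqinit Sends */
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else {
		INIT_CQCOUNT(ep);	/* restart the countdown */
		send_wr.send_flags = IB_SEND_SIGNALED;
	}
	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);

With the reply handler now waiting for invalidation to complete, this cadence alone keeps the send queue bounded, so the extra ceiling is redundant.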
Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 6 ++---- net/sunrpc/xprtrdma/xprt_rdma.h | 6 ------ 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 003630733ef3..732c71ce5dca 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -616,10 +616,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS) - ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS; - else if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; /* always signal? */ INIT_CQCOUNT(ep); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ddae4909982b..728101ddc44b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -88,12 +88,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -/* - * Force a signaled SEND Work Request every so often, - * in case the provider needs to do some housekeeping. - */ -#define RPCRDMA_MAX_UNSIGNALED_SENDS (32) - #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) -- cgit v1.2.3 From c3d4879e01bec484f50a78c108341f039d470e96 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 11 Dec 2015 16:45:58 -0500 Subject: sunrpc: Add a function to close temporary transports immediately Add a function svc_age_temp_xprts_now() to close temporary transports whose xpt_local matches the address passed in server_addr immediately instead of waiting for them to be closed by the timer function. The function is intended to be used by notifier_blocks that will be added to nfsd and lockd that will run when an ip address is deleted. This will eliminate the ACK storms and client hangs that occur in HA-NFS configurations where nfsd & lockd is left running on the cluster nodes all the time and the NFS 'service' is migrated back and forth within a short timeframe. Signed-off-by: Scott Mayhew Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a6cbb2104667..7422f28818b2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -10,11 +10,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +/* Close temporary transports whose xpt_local matches server_addr immediately + * instead of waiting for them to be picked up by the timer. + * + * This is meant to be called from a notifier_block that runs when an ip + * address is deleted. 
+ */ +void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) +{ + struct svc_xprt *xprt; + struct svc_sock *svsk; + struct socket *sock; + struct list_head *le, *next; + LIST_HEAD(to_be_closed); + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + spin_lock_bh(&serv->sv_lock); + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + if (rpc_cmp_addr(server_addr, (struct sockaddr *) + &xprt->xpt_local)) { + dprintk("svc_age_temp_xprts_now: found %p\n", xprt); + list_move(le, &to_be_closed); + } + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_closed)) { + le = to_be_closed.next; + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); + svc_close_xprt(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); + static void call_xpt_users(struct svc_xprt *xprt) { struct svc_xpt_user *u; -- cgit v1.2.3 From d1358917f2eb530bc6a097937302282a428806f8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 2 Dec 2015 14:17:52 +0800 Subject: SUNRPC: drop unused xs_reclassify_socketX() helpers xs_reclassify_socket4() and friends used to be called directly. xs_reclassify_socket() is called instead nowadays. The xs_reclassify_socketX() helper functions are empty when CONFIG_DEBUG_LOCK_ALLOC is not defined. Drop them since they have no callers. Note that AF_LOCAL still calls xs_reclassify_socketu() directly but is easily converted to generic xs_reclassify_socket(). Signed-off-by: Stefan Hajnoczi Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ffaf6a79499..70c13d675dc1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1907,18 +1907,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else -static inline void xs_reclassify_socketu(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket4(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket6(struct socket *sock) -{ -} - static inline void xs_reclassify_socket(int family, struct socket *sock) { } @@ -2008,7 +1996,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) "transport socket (%d).\n", -status); goto out; } - xs_reclassify_socketu(sock); + xs_reclassify_socket(AF_LOCAL, sock); dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); -- cgit v1.2.3 From 0b161e6330e27c191a0ff0d44082ff7832a8c8a1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 30 Dec 2015 18:14:06 -0500 Subject: SUNRPC: Fix a missing break in rpc_anyaddr() The missing break means that we always return EAFNOSUPPORT when faced with a request for an IPv6 loopback address. 
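A minimal standalone illustration of the fall-through (hypothetical code, not the kernel function): once the IPv6 case finishes its copy, control drops straight into the default arm and the error is returned anyway.

#include <stdio.h>

static int anyaddr(int family)
{
	switch (family) {
	case 6:
		/* copy the IPv6 loopback address ... */
		break;		/* without this break we fall into "default" */
	default:
		return -97;	/* -EAFNOSUPPORT: "address family not supported" */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", anyaddr(6));	/* prints 0; without the break it prints -97 */
	return 0;
}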
Reported-by: coverity (CID 401987) Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 23608eb0ded2..b7f21044f4d8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); + break; default: dprintk("RPC: %s: address family not supported\n", __func__); -- cgit v1.2.3 From 13331a551ab4df87f7a027d2cab392da96aba1de Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 6 Jan 2016 08:57:06 -0500 Subject: SUNRPC: Fixup socket wait for memory We're seeing hangs in the NFS client code, with loops of the form: RPC: 30317 xmit incomplete (267368 left of 524448) RPC: 30317 call_status (status -11) RPC: 30317 call_transmit (status 0) RPC: 30317 xprt_prepare_transmit RPC: 30317 xprt_transmit(524448) RPC: xs_tcp_send_request(267368) = -11 RPC: 30317 xmit incomplete (267368 left of 524448) RPC: 30317 call_status (status -11) RPC: 30317 call_transmit (status 0) RPC: 30317 xprt_prepare_transmit RPC: 30317 xprt_transmit(524448) Turns out commit ceb5d58b2170 ("net: fix sock_wake_async() rcu protection") moved SOCKWQ_ASYNC_NOSPACE out of sock->flags and into sk->sk_wq->flags, however it never tried to fix up the code in net/sunrpc. The new idiom is to use the flags in the RCU protected struct socket_wq. While we're at it, clear out the now redundant places where we set/clear SOCKWQ_ASYNC_NOSPACE and SOCK_NOSPACE. In principle, sk_stream_wait_memory() is supposed to set these for us, so we only need to clear them in the particular case of our ->write_space() callback. Fixes: ceb5d58b2170 ("net: fix sock_wake_async() rcu protection") Cc: Eric Dumazet Cc: stable@vger.kernel.org # 4.4 Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ffaf6a79499..027c9ef8a263 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -616,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned 
unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -626,7 +612,10 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if (unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** -- cgit v1.2.3 From 3daa020f9bf803c03c6b6c895921e2b09fcd494a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 23 Dec 2015 15:08:08 -0500 Subject: Revert "svcrdma: Do not send XDR roundup bytes for a write chunk" This reverts commit 6f18dc893981e4daab29221d6a9771f3ce2dd8c5. Just as one example, it appears this code could do the wrong thing in the case of a two-byte NFS READ that crosses a page boundary. Chuck says: "In that case, nfsd would pass down an xdr_buf that has one byte in a page, one byte in another page, and a two-byte XDR pad. The logic introduced by this optimization would be fooled, and neither the second byte nor the XDR pad would be written to the client." Cc: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index bad5eaa9f812..969a1ab75fc3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -342,13 +342,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, arg_ch->rs_handle, arg_ch->rs_offset, write_len); - - /* Do not send XDR pad bytes */ - if (chunk_no && write_len < 4) { - chunk_no++; - break; - } - chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, -- cgit v1.2.3 From af63cf51b7f960aa73b32bac683cd4078f08fa0e Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 30 Nov 2015 17:34:01 +0100 Subject: batman-adv: fix lockdep splat when doing mcast_free While testing, we got something like this: WARNING: CPU: 0 PID: 238 at net/batman-adv/multicast.c:142 batadv_mcast_mla_tt_retract+0x94/0x205 [batman_adv]() [...] Call Trace: [] dump_stack+0x4b/0x64 [] warn_slowpath_common+0xbc/0x120 [] ? 
batadv_mcast_mla_tt_retract+0x94/0x205 [batman_adv] [] warn_slowpath_null+0x15/0x20 [] batadv_mcast_mla_tt_retract+0x94/0x205 [batman_adv] [] batadv_mcast_free+0x36/0x39 [batman_adv] [] batadv_mesh_free+0x7d/0x13f [batman_adv] [] batadv_softif_free+0x15/0x25 [batman_adv] [...] Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/multicast.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index eb76386f8d4b..75fa5013af72 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -802,7 +802,9 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + spin_lock_bh(&bat_priv->tt.commit_lock); batadv_mcast_mla_tt_retract(bat_priv, NULL); + spin_unlock_bh(&bat_priv->tt.commit_lock); } /** -- cgit v1.2.3 From bab7c6c3deac70966a3000402c0ea6d0c20edd15 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 5 Jan 2016 12:06:17 +0100 Subject: batman-adv: Fix list removal of batadv_hardif_neigh_node The neigh_list with batadv_hardif_neigh_node objects is accessed with only rcu_read_lock in batadv_hardif_neigh_get and batadv_iv_neigh_print. Thus it is not allowed to kfree the object before the rcu grace period ends (which may still protects context accessing this object). Therefore the object has first to be removed from the neigh_list and then it has either wait with synchronize_rcu or call_rcu till the grace period ends before it can be freed. Fixes: cef63419f7db ("batman-adv: add list of unique single hop neighbors per hard-interface") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/originator.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 3c782a33bdac..ae6d18cafc5a 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -211,10 +211,6 @@ static void batadv_hardif_neigh_free_rcu(struct rcu_head *rcu) hardif_neigh = container_of(rcu, struct batadv_hardif_neigh_node, rcu); - spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - hlist_del_init_rcu(&hardif_neigh->list); - spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - batadv_hardif_free_ref_now(hardif_neigh->if_incoming); kfree(hardif_neigh); } @@ -227,8 +223,13 @@ static void batadv_hardif_neigh_free_rcu(struct rcu_head *rcu) static void batadv_hardif_neigh_free_now(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) + if (atomic_dec_and_test(&hardif_neigh->refcount)) { + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + hlist_del_init_rcu(&hardif_neigh->list); + spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + batadv_hardif_neigh_free_rcu(&hardif_neigh->rcu); + } } /** @@ -238,8 +239,13 @@ batadv_hardif_neigh_free_now(struct batadv_hardif_neigh_node *hardif_neigh) */ void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) + if (atomic_dec_and_test(&hardif_neigh->refcount)) { + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + hlist_del_init_rcu(&hardif_neigh->list); + spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + call_rcu(&hardif_neigh->rcu, 
batadv_hardif_neigh_free_rcu); + } } /** -- cgit v1.2.3 From b8e429a2feac623a34e21099a4a69de29b6d873e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 13 Jan 2016 10:28:06 -0500 Subject: genetlink: Fix off-by-one in genl_allocate_reserve_groups() The bug fix for adding n_groups to the computation forgot to adjust ">=" to ">" to keep the condition correct. Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d3f6b063467b..f830326b3b1d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -185,7 +185,7 @@ static int genl_allocate_reserve_groups(int n_groups, int *first_id) } } - if (id + n_groups >= mc_groups_longs * BITS_PER_LONG) { + if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); -- cgit v1.2.3 From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). 
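As a usage sketch (hypothetical cache and buffer names; SLAB_ACCOUNT and __GFP_ACCOUNT are the flags this series adds), opting an allocation site into memcg accounting looks like this:

	/* Cache-backed objects: every object from this cache is charged
	 * to the allocating task's memory cgroup.
	 */
	foo_inode_cachep = kmem_cache_create("foo_inode_cache",
					     sizeof(struct foo_inode), 0,
					     SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
					     foo_inode_init_once);

	/* One-off allocations: charge just this buffer */
	buf = kmalloc(len, GFP_KERNEL | __GFP_ACCOUNT);

Unmarked caches and plain GFP_KERNEL allocations remain unaccounted, which is how the overhead stays confined to objects user space can allocate in bulk.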
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 91c2de6f5020..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..14f45bf0410c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; -- cgit v1.2.3 From 9ee11ba4251dddf1b0e507d184b25b1bd7820773 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:19:41 -0800 Subject: memcg: do not allow to disable tcp accounting after limit is set There are two bits defined for cg_proto->flags - MEMCG_SOCK_ACTIVATED and MEMCG_SOCK_ACTIVE - both are set in tcp_update_limit, but the former is never cleared while the latter can be cleared by unsetting the limit. This allows to disable tcp socket accounting for new sockets after it was enabled by writing -1 to memory.kmem.tcp.limit_in_bytes while still guaranteeing that memcg_socket_limit_enabled static key will be decremented on memcg destruction. This functionality looks dubious, because it is not clear what a use case would be. By enabling tcp accounting a user accepts the price. If they then find the performance degradation unacceptable, they can always restart their workload with tcp accounting disabled. It does not seem there is any need to flip it while the workload is running. Besides, it contradicts to how kmem accounting API works: writing whatever to memory.kmem.limit_in_bytes enables kmem accounting for the cgroup in question, after which it cannot be disabled. Therefore one might expect that writing -1 to memory.kmem.tcp.limit_in_bytes just enables socket accounting w/o limiting it, which might be useful by itself, but it isn't true. Since this API peculiarity is not documented anywhere, I propose to drop it. This will allow to simplify the code by dropping cg_proto->flags. 
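For background, memcg_socket_limit_enabled is a plain struct static_key, so the enable path being discussed is the usual jump-label pattern; roughly (illustrative names, not the kernel code):

static struct static_key my_accounting_enabled = STATIC_KEY_INIT_FALSE;

static inline bool my_accounting_active(void)
{
	/* Patched jump: a NOP on the fast path until the key goes non-zero */
	return static_key_false(&my_accounting_enabled);
}

static void my_accounting_enable(void)
{
	static_key_slow_inc(&my_accounting_enabled);
}

static void my_accounting_disable(void)
{
	/* Every inc needs a matching dec, which is why teardown must have a
	 * reliable record that the limit was ever set.
	 */
	static_key_slow_dec(&my_accounting_enabled);
}

Dropping the ability to re-disable accounting means a single "active" flag is enough to keep the inc/dec pair balanced.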
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_memcontrol.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 2379c1b4efb2..d07579ada001 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -48,7 +48,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) percpu_counter_destroy(&cg_proto->sockets_allocated); - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) + if (cg_proto->active) static_key_slow_dec(&memcg_socket_limit_enabled); } @@ -72,11 +72,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) cg_proto->sysctl_mem[i] = min_t(long, nr_pages, sysctl_tcp_mem[i]); - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { + if (!cg_proto->active) { /* - * The active bit needs to be written after the static_key + * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation * function is the last one to run. See sock_update_memcg() for * details, and note that we don't mark any socket as belonging @@ -90,14 +88,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * We never race with the readers in sock_update_memcg(), * because when this value change, the code to process it is not * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); + static_key_slow_inc(&memcg_socket_limit_enabled); + cg_proto->active = true; } return 0; -- cgit v1.2.3 From 3d596f7b907b0281b997cf30c92994a71ad0a1a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:05 -0800 Subject: net: tcp_memcontrol: protect all tcp_memcontrol calls by jump-label Move the jump-label from sock_update_memcg() and sock_release_memcg() to the callsite, and so eliminate those function calls when socket accounting is not enabled. This also eliminates the need for dummy functions because the calls will be optimized away if the Kconfig options are not enabled. Signed-off-by: Johannes Weiner Acked-by: David S. 
Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 9 ++------- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_ipv4.c | 4 +++- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 51270238e269..6c5dab01105b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1507,12 +1507,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1607,7 +1601,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7bb1b091efd1..fd17eec93525 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -422,7 +422,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65947c1f4733..eb39e02899e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1818,7 +1818,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); -- cgit v1.2.3 From af95d7df4059cfeab7e7c244f3564214aada7dad Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:08 -0800 Subject: net: tcp_memcontrol: remove dead per-memcg count of allocated sockets The number of allocated sockets is used for calculations in the soft limit phase, where packets are accepted but the socket is under memory pressure. Since there is no soft limit phase in tcp_memcontrol, and memory pressure is only entered when packets are already dropped, this is actually dead code. Remove it. As this is the last user of parent_cg_proto(), remove that too. Signed-off-by: Johannes Weiner Acked-by: David S. 
Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d07579ada001..6759e0d6bba1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) counter_parent = &parent_cg->memory_allocated; page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } @@ -46,8 +45,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) if (!cg_proto) return; - percpu_counter_destroy(&cg_proto->sockets_allocated); - if (cg_proto->active) static_key_slow_dec(&memcg_socket_limit_enabled); -- cgit v1.2.3 From 80f23124f57c77915a7b4201d8dcba38a38b23f0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:11 -0800 Subject: net: tcp_memcontrol: simplify the per-memcg limit access tcp_memcontrol replicates the global sysctl_mem limit array per cgroup, but it only ever sets these entries to the value of the memory_allocated page_counter limit. Use the latter directly. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_memcontrol.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 6759e0d6bba1..ef4268d12e43 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -21,9 +21,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (!cg_proto) return 0; - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; cg_proto->memory_pressure = 0; cg_proto->memcg = memcg; @@ -54,7 +51,6 @@ EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { struct cg_proto *cg_proto; - int i; int ret; cg_proto = tcp_prot.proto_cgroup(memcg); @@ -65,10 +61,6 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) if (ret) return ret; - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - if (!cg_proto->active) { /* * The active flag needs to be written after the static_key -- cgit v1.2.3 From e805605c721021879a1469bdae45c6f80bc985f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:14 -0800 Subject: net: tcp_memcontrol: sanitize tcp memory accounting callbacks There won't be a tcp control soft limit, so integrating the memcg code into the global skmem limiting scheme complicates things unnecessarily. Replace this with simple and clear charge and uncharge calls--hidden behind a jump label--to account skb memory. Note that this is not purely aesthetic: as a result of shoehorning the per-memcg code into the same memory accounting functions that handle the global level, the old code would compare the per-memcg consumption against the smaller of the per-memcg limit and the global limit. This allowed the total consumption of multiple sockets to exceed the global limit, as long as the individual sockets stayed within bounds. 
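The new flow is easiest to see in one piece. The following is a simplified sketch of the reworked __sk_mem_schedule() shape, matching the hunks further below; the minimum-buffer fallback and other suppress-path details of the real function are left out.

/* Simplified sketch: the memcg charge is attempted up front,
 * independently of the global sk_prot_mem_limits() checks.
 */
static int sk_mem_schedule_sketch(struct sock *sk, int amt)
{
        long allocated = sk_memory_allocated_add(sk, amt);

        if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
            !mem_cgroup_charge_skmem(sk->sk_cgrp, amt))
                goto suppress;

        if (allocated <= sk_prot_mem_limits(sk, 0)) {   /* global: under limit */
                sk_leave_memory_pressure(sk);
                return 1;
        }
        if (allocated > sk_prot_mem_limits(sk, 1))      /* global: under pressure */
                sk_enter_memory_pressure(sk);
        if (allocated <= sk_prot_mem_limits(sk, 2))     /* global: not over hard limit */
                return 1;

suppress:
        sk_memory_allocated_sub(sk, amt);
        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
                mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt);
        return 0;
}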
After this change, the code will always compare the per-memcg consumption to the per-memcg limit, and the global consumption to the global limit, and thus close this loophole. Without a soft limit, the per-memcg memory pressure state in sockets is generally questionable. However, we did it until now, so we continue to enter it when the hard limit is hit, and packets are dropped, to let other sockets in the cgroup know that they shouldn't grow their transmit windows, either. However, keep it simple in the new callback model and leave memory pressure lazily when the next packet is accepted (as opposed to doing it synchronously when packets are processed). When packets are dropped, network performance will already be in the toilet, so that should be a reasonable trade-off. As described above, consumption is now checked on the per-memcg level and the global level separately. Likewise, memory pressure states are maintained on both the per-memcg level and the global level, and a socket is considered under pressure when either level asserts as much. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 26 ++++++++++++++++---------- net/ipv4/tcp_output.c | 7 +++++-- 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 6c5dab01105b..89ae859d2dc5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2084,27 +2084,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp && + !mem_cgroup_charge_skmem(sk->sk_cgrp, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit.
*/ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2153,6 +2153,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2168,6 +2171,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 412a920fe0ec..493b48945f0c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2813,13 +2813,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_charge_skmem(sk->sk_cgrp, amt); } /* Send a FIN. The caller locks the socket for us. -- cgit v1.2.3 From baac50bbc3cdfd184ebf586b1704edbfcee866df Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:17 -0800 Subject: net: tcp_memcontrol: simplify linkage between socket and page counter There won't be any separate counters for socket memory consumed by protocols other than TCP in the future. Remove the indirection and link sockets directly to their owning memory cgroup. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. 
Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 52 +++++------------------------------- net/ipv4/tcp_ipv4.c | 7 +---- net/ipv4/tcp_memcontrol.c | 67 ++++++++++++++++++----------------------------- net/ipv4/tcp_output.c | 4 +-- net/ipv6/tcp_ipv6.c | 3 --- 5 files changed, 36 insertions(+), 97 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 89ae859d2dc5..3535bffa45f3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -1601,7 +1563,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) @@ -2089,8 +2051,8 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) allocated = sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - !mem_cgroup_charge_skmem(sk->sk_cgrp, amt)) + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) goto suppress_allocation; /* Under limit. 
*/ @@ -2153,8 +2115,8 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; } @@ -2171,8 +2133,8 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eb39e02899e5..c7d1fb50f381 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1819,7 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2343,11 +2343,6 @@ struct proto tcp_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, -#endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, #endif .diag_destroy = tcp_abort, }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index ef4268d12e43..e5078259cbe3 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -8,60 +8,47 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct page_counter *counter_parent = NULL; /* * The root cgroup does not use page_counters, but rather, * rely on the data already collected by the network * subsystem */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; + memcg->tcp_mem.memory_pressure = 0; - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; + if (parent) + counter_parent = &parent->tcp_mem.memory_allocated; - page_counter_init(&cg_proto->memory_allocated, counter_parent); + page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent); return 0; } -EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return; - if (cg_proto->active) + if (memcg->tcp_mem.active) static_key_slow_dec(&memcg_socket_limit_enabled); - } -EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { - struct cg_proto *cg_proto; int ret; - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return -EINVAL; - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages); if (ret) return ret; - if (!cg_proto->active) { + if (!memcg->tcp_mem.active) { /* * The active flag needs to be written after the static_key * update. 
This is what guarantees that the socket activation @@ -79,7 +66,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * patched in yet. */ static_key_slow_inc(&memcg_socket_limit_enabled); - cg_proto->active = true; + memcg->tcp_mem.active = true; } return 0; @@ -123,32 +110,32 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); u64 val; switch (cft->private) { case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; + if (memcg == root_mem_cgroup) + val = PAGE_COUNTER_MAX; + else + val = memcg->tcp_mem.memory_allocated.limit; val *= PAGE_SIZE; break; case RES_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) val = atomic_long_read(&tcp_memory_allocated); else - val = page_counter_read(&cg_proto->memory_allocated); + val = page_counter_read(&memcg->tcp_mem.memory_allocated); val *= PAGE_SIZE; break; case RES_FAILCNT: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.failcnt; + val = memcg->tcp_mem.memory_allocated.failcnt; break; case RES_MAX_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.watermark; + val = memcg->tcp_mem.memory_allocated.watermark; val *= PAGE_SIZE; break; default: @@ -161,19 +148,17 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; - struct cg_proto *cg_proto; memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return nbytes; switch (of_cft(of)->private) { case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); + page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated); break; case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; + memcg->tcp_mem.memory_allocated.failcnt = 0; break; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 493b48945f0c..fda379cd600d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2821,8 +2821,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_charge_skmem(sk->sk_cgrp, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index db9f1c318afc..4ad8edb46f7c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1888,9 +1888,6 @@ struct proto tcpv6_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, -#endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, #endif .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, -- cgit v1.2.3 From 80e95fe0fdcde2812c341ad4209d62dc1a7af53b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:20 -0800 Subject: mm: memcontrol: generalize the socket accounting jump label The unified hierarchy memory controller is going to use this jump label as well to control the networking callbacks. Move it to the memory controller code and give it a more generic name. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 5 ----- net/ipv4/tcp_memcontrol.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 3535bffa45f3..6c1c8bc93412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -202,11 +202,6 @@ EXPORT_SYMBOL(sk_net_capable); static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index e5078259cbe3..9a22e2dfd64a 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -34,7 +34,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) return; if (memcg->tcp_mem.active) - static_key_slow_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_sockets_enabled_key); } static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) @@ -65,7 +65,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * because when this value change, the code to process it is not * patched in yet. */ - static_key_slow_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_sockets_enabled_key); memcg->tcp_mem.active = true; } -- cgit v1.2.3 From ef12947c9c5a96af549c49f10e5503f0612a397c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:34 -0800 Subject: mm: memcontrol: switch to the updated jump-label API The direct use of struct static_key is deprecated in favor of the DEFINE_STATIC_KEY_*/static_branch API. Update the socket and slab accounting code accordingly. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reported-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 9a22e2dfd64a..18bc7f745e9c 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -34,7 +34,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) return; if (memcg->tcp_mem.active) - static_key_slow_dec(&memcg_sockets_enabled_key); + static_branch_dec(&memcg_sockets_enabled_key); } static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) @@ -65,7 +65,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * because when this value change, the code to process it is not * patched in yet. */ - static_key_slow_inc(&memcg_sockets_enabled_key); + static_branch_inc(&memcg_sockets_enabled_key); memcg->tcp_mem.active = true; } -- cgit v1.2.3 From 9207f9d45b0ad071baa128e846d7e7ed85016df3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 8 Jan 2016 15:21:46 +0300 Subject: net: preserve IP control block during GSO segmentation skb_gso_segment() uses the skb control block during segmentation. This patch adds 32 bytes of room for the previous control block, which will be copied into all resulting segments. This patch fixes a kernel crash when fragmenting forwarded packets. Fragmentation requires a valid IP CB in the skb for clearing IP options.
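From a caller's point of view, the contract introduced here is simply that any private control-block state that must survive segmentation has to fit below SKB_SGO_CB_OFFSET. A hedged caller-side sketch follows; the wrapper name is made up, and the pattern mirrors the ip_output.c and openvswitch hunks shown below.

/* Assert at build time that IPCB() lands in the preserved region,
 * then segment; __skb_gso_segment() copies that region into every
 * resulting segment.
 */
static struct sk_buff *segment_preserving_ipcb(struct sk_buff *skb,
                                               netdev_features_t features)
{
        BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);

        return skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
}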
Also patch removes custom save/restore in ovs code, now it's redundant. Signed-off-by: Konstantin Khlebnikov Link: http://lkml.kernel.org/r/CALYGNiP-0MZ-FExV2HutTvE9U-QQtkKSoE--KN=JQE5STYsjAA@mail.gmail.com Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++++ net/ipv4/ip_output.c | 1 + net/openvswitch/datapath.c | 5 +---- net/xfrm/xfrm_output.c | 2 ++ 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0ca95d5d7af0..cc9e3652cf93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2695,6 +2695,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2709,6 +2711,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 512a44778cf2..64878efa045c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -239,6 +239,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * from host network stack. */ features = netif_skb_features(skb); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 91a8b004dc51..deadfdab1bc3 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -336,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -359,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index cc3676eb6239..ff4a91fcab9f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -167,6 +167,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs; + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) -- cgit v1.2.3 From 65a5124a71e85c35fa8d047a471950325855dccf Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 14 Jan 2016 13:49:34 +0800 Subject: sctp: support to lookup with ep+paddr in transport rhashtable Now, when we sendmsg, we translate the ep to laddr by selecting the first element of the list, and then do a lookup for a transport. But sctp_hash_cmp() will compare it against asoc addr_list, which may be a subset of ep addr_list, meaning that this chosen laddr may not be there, and thus making it impossible to find the transport. 
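The fix, shown in condensed form here and in full in the patch that follows, keys the lookup on the endpoint pointer plus peer address instead of on a guessed local address; the wrapper name is invented, and the struct fields mirror the patch.

/* Condensed sketch: the compare argument carries the endpoint, so
 * sctp_hash_cmp() can match on asoc->ep directly instead of on a
 * local address that may not be in the association's address list.
 */
static struct sctp_transport *ep_paddr_lookup_sketch(const struct sctp_endpoint *ep,
                                                     const union sctp_addr *paddr)
{
        struct sctp_hash_cmp_arg arg = {
                .ep    = ep,
                .paddr = paddr,
                .net   = sock_net(ep->base.sk),
        };

        return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
                                      sctp_hash_params);
}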
So we fix it by using ep + paddr to look up transports in the hashtable. In sctp_hash_cmp(), if .ep is set, we check whether it equals asoc->ep; otherwise we fall back to the laddr check. Fixes: d6c0256a60e6 ("sctp: add the rhashtable apis for sctp global transport hashtable") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Reported-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/input.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index d9a6e66c5c8a..b9a536b52da2 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -784,6 +784,7 @@ hit: /* rhashtable for transport */ struct sctp_hash_cmp_arg { + const struct sctp_endpoint *ep; const union sctp_addr *laddr; const union sctp_addr *paddr; const struct net *net; @@ -797,15 +798,20 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, struct sctp_association *asoc = t->asoc; const struct net *net = x->net; - if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return 1; if (!net_eq(sock_net(asoc->base.sk), net)) return 1; - if (!sctp_bind_addr_match(&asoc->base.bind_addr, - x->laddr, sctp_sk(asoc->base.sk))) - return 1; + if (x->ep) { + if (x->ep != asoc->ep) + return 1; + } else { + if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) + return 1; + if (!sctp_bind_addr_match(&asoc->base.bind_addr, + x->laddr, sctp_sk(asoc->base.sk))) + return 1; + } return 0; } @@ -832,9 +838,11 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed) const struct sctp_hash_cmp_arg *x = data; const union sctp_addr *paddr = x->paddr; const struct net *net = x->net; - u16 lport = x->laddr->v4.sin_port; + u16 lport; u32 addr; + lport = x->ep ? htons(x->ep->base.bind_addr.port) : + x->laddr->v4.sin_port; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else @@ -864,12 +872,9 @@ void sctp_transport_hashtable_destroy(void) void sctp_hash_transport(struct sctp_transport *t) { - struct sctp_sockaddr_entry *addr; struct sctp_hash_cmp_arg arg; - addr = list_entry(t->asoc->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - arg.laddr = &addr->a; + arg.ep = t->asoc->ep; arg.paddr = &t->ipaddr; arg.net = sock_net(t->asoc->base.sk); @@ -891,6 +896,7 @@ struct sctp_transport *sctp_addrs_lookup_transport( const union sctp_addr *paddr) { struct sctp_hash_cmp_arg arg = { + .ep = NULL, .laddr = laddr, .paddr = paddr, .net = net, @@ -904,13 +910,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct sctp_sockaddr_entry *addr; struct net *net = sock_net(ep->base.sk); + struct sctp_hash_cmp_arg arg = { + .ep = ep, + .paddr = paddr, + .net = net, + }; - addr = list_entry(ep->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - - return sctp_addrs_lookup_transport(net, &addr->a, paddr); + return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, + sctp_hash_params); } /* Look up an association. */ -- cgit v1.2.3 From 34ae6a1aa0540f0f781dd265366036355fdc8930 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Jan 2016 04:56:56 -0800 Subject: ipv6: update skb->csum when CE mark is propagated When a tunnel decapsulates the outer header, it has to comply with RFC 6040 and, where applicable, propagate the CE mark into the inner header.
It turns out IP6_ECN_set_ce() does not correctly update skb->csum for CHECKSUM_COMPLETE packets, triggering infamous "hw csum failure" messages and stack traces. Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/xfrm6_mode_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index f7fbdbabe50e..372855eeaf42 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -23,7 +23,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP6_ECN_set_ce(inner_iph); + IP6_ECN_set_ce(skb, inner_iph); } /* Add encapsulation header. -- cgit v1.2.3 From fb3311853c0f23391fc3441d49a46d076de57757 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 15 Jan 2016 14:44:31 +0100 Subject: net: sctp: Move sequence start handling into sctp_transport_get_idx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/sctp/proc.c: In function ‘sctp_transport_get_idx’: net/sctp/proc.c:313: warning: ‘obj’ may be used uninitialized in this function This is currently a false positive, as all callers check for a zero offset first, and handle this case in the exact same way. Move the check and handling into sctp_transport_get_idx() to kill the compiler warning, and avoid future bugs. Signed-off-by: Geert Uytterhoeven Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/proc.c b/net/sctp/proc.c index dfa7eeccb537..684c5b31563b 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -310,7 +310,7 @@ static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq) static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq, loff_t pos) { - void *obj; + void *obj = SEQ_START_TOKEN; while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj)) pos--; @@ -347,7 +347,7 @@ static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) @@ -462,7 +462,7 @@ static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) -- cgit v1.2.3 From c6894dec8ea9ae05747124dce98b3b5c2e69b168 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 15 Jan 2016 19:03:54 +0100 Subject: bridge: fix lockdep addr_list_lock false positive splat After promisc mode management was introduced a bridge device could do dev_set_promiscuity from its ndo_change_rx_flags() callback which in turn can be called after the bridge's addr_list_lock has been taken (e.g. by dev_uc_add). This causes a false positive lockdep splat because the port interfaces' addr_list_lock is taken when br_manage_promisc() runs after the bridge's addr list lock was already taken. To remove the false positive introduce a custom bridge addr_list_lock class and set it on bridge init. 
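The shape of the fix itself is small; a sketch of the lockdep pattern is below, and the reproduction steps and splat follow. The names mirror the patch at the end of this commit.

/* Give the bridge device's addr_list_lock its own lockdep class so
 * that taking a port's (default-class) lock while holding it is not
 * flagged as same-class recursion.
 */
static struct lock_class_key bridge_netdev_addr_lock_key;

static void br_set_lockdep_class_sketch(struct net_device *dev)
{
        lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key);
}
/* Called once from br_dev_init(), after the device's per-cpu stats and
 * vlan state are set up.
 */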
A simple way to reproduce this is with the following: $ brctl addbr br0 $ ip l add l br0 br0.100 type vlan id 100 $ ip l set br0 up $ ip l set br0.100 up $ echo 1 > /sys/class/net/br0/bridge/vlan_filtering $ brctl addif br0 eth0 Splat: [ 43.684325] ============================================= [ 43.684485] [ INFO: possible recursive locking detected ] [ 43.684636] 4.4.0-rc8+ #54 Not tainted [ 43.684755] --------------------------------------------- [ 43.684906] brctl/1187 is trying to acquire lock: [ 43.685047] (_xmit_ETHER){+.....}, at: [] dev_set_rx_mode+0x1e/0x40 [ 43.685460] but task is already holding lock: [ 43.685618] (_xmit_ETHER){+.....}, at: [] dev_uc_add+0x27/0x80 [ 43.686015] other info that might help us debug this: [ 43.686316] Possible unsafe locking scenario: [ 43.686743] CPU0 [ 43.686967] ---- [ 43.687197] lock(_xmit_ETHER); [ 43.687544] lock(_xmit_ETHER); [ 43.687886] *** DEADLOCK *** [ 43.688438] May be due to missing lock nesting notation [ 43.688882] 2 locks held by brctl/1187: [ 43.689134] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 [ 43.689852] #1: (_xmit_ETHER){+.....}, at: [] dev_uc_add+0x27/0x80 [ 43.690575] stack backtrace: [ 43.690970] CPU: 0 PID: 1187 Comm: brctl Not tainted 4.4.0-rc8+ #54 [ 43.691270] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 [ 43.691770] ffffffff826a25c0 ffff8800369fb8e0 ffffffff81360ceb ffffffff826a25c0 [ 43.692425] ffff8800369fb9b8 ffffffff810d0466 ffff8800369fb968 ffffffff81537139 [ 43.693071] ffff88003a08c880 0000000000000000 00000000ffffffff 0000000002080020 [ 43.693709] Call Trace: [ 43.693931] [] dump_stack+0x4b/0x70 [ 43.694199] [] __lock_acquire+0x1e46/0x1e90 [ 43.694483] [] ? netlink_broadcast_filtered+0x139/0x3e0 [ 43.694789] [] ? nlmsg_notify+0x5a/0xc0 [ 43.695064] [] lock_acquire+0xe5/0x1f0 [ 43.695340] [] ? dev_set_rx_mode+0x1e/0x40 [ 43.695623] [] _raw_spin_lock_bh+0x45/0x80 [ 43.695901] [] ? dev_set_rx_mode+0x1e/0x40 [ 43.696180] [] dev_set_rx_mode+0x1e/0x40 [ 43.696460] [] dev_set_promiscuity+0x3c/0x50 [ 43.696750] [] br_port_set_promisc+0x25/0x50 [bridge] [ 43.697052] [] br_manage_promisc+0x8a/0xe0 [bridge] [ 43.697348] [] br_dev_change_rx_flags+0x1e/0x20 [bridge] [ 43.697655] [] __dev_set_promiscuity+0x132/0x1f0 [ 43.697943] [] __dev_set_rx_mode+0x82/0x90 [ 43.698223] [] dev_uc_add+0x5e/0x80 [ 43.698498] [] vlan_device_event+0x542/0x650 [8021q] [ 43.698798] [] notifier_call_chain+0x5d/0x80 [ 43.699083] [] raw_notifier_call_chain+0x16/0x20 [ 43.699374] [] call_netdevice_notifiers_info+0x6e/0x80 [ 43.699678] [] call_netdevice_notifiers+0x16/0x20 [ 43.699973] [] br_add_if+0x47e/0x4c0 [bridge] [ 43.700259] [] add_del_if+0x6e/0x80 [bridge] [ 43.700548] [] br_dev_ioctl+0xaf/0xc0 [bridge] [ 43.700836] [] dev_ifsioc+0x30c/0x3c0 [ 43.701106] [] dev_ioctl+0xf9/0x6f0 [ 43.701379] [] ? mntput_no_expire+0x5/0x450 [ 43.701665] [] ? mntput_no_expire+0xae/0x450 [ 43.701947] [] sock_do_ioctl+0x42/0x50 [ 43.702219] [] sock_ioctl+0x1e5/0x290 [ 43.702500] [] do_vfs_ioctl+0x2cb/0x5c0 [ 43.702771] [] SyS_ioctl+0x79/0x90 [ 43.703033] [] entry_SYSCALL_64_fastpath+0x16/0x7a CC: Vlad Yasevich CC: Stephen Hemminger CC: Bridge list CC: Andy Gospodarek CC: Roopa Prabhu Fixes: 2796d0c648c9 ("bridge: Automatically manage port promiscuous mode.") Reported-by: Andy Gospodarek Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_device.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5e88d3e17546..2c8095a5d824 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -28,6 +28,8 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); +static struct lock_class_key bridge_netdev_addr_lock_key; + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -87,6 +89,11 @@ out: return NETDEV_TX_OK; } +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev) err = br_vlan_init(br); if (err) free_percpu(br->stats); + br_set_lockdep_class(dev); return err; } -- cgit v1.2.3