diff options
Diffstat (limited to 'fs')
136 files changed, 2780 insertions, 1748 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 2bc7ad775842..3ef62bad8f2b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y + select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c7efddf6e038..4c09d93d9569 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,7 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU || M68K + depends on !MMU || ARM || M68K depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7ef637d7f3a5..1e9d2f84e5b5 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work) */ int __init afs_callback_update_init(void) { - afs_callback_update_worker = - create_singlethread_workqueue("kafs_callbackd"); + afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", + WQ_MEM_RECLAIM); return afs_callback_update_worker ? 0 : -ENOMEM; } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 4b0eff6da674..2037e7a77a37 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -17,19 +17,12 @@ #include "internal.h" #include "afs_cm.h" -#if 0 -struct workqueue_struct *afs_cm_workqueue; -#endif /* 0 */ - -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); /* @@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call) * received. The step number here must match the final number in * afs_deliver_cb_callback(). */ - if (call->unmarshall == 6) { + if (call->unmarshall == 5) { ASSERT(call->server && call->count && call->request); afs_break_callbacks(call->server, call->count, call->request); } @@ -168,32 +161,29 @@ static void SRXAFSCB_CallBack(struct work_struct *work) /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_callback *cb; struct afs_server *server; - struct in_addr addr; __be32 *bp; u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); @@ -208,13 +198,10 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, true); + if (ret < 0) + return ret; _debug("unmarshall FID array"); call->request = kcalloc(call->count, @@ -238,12 +225,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); @@ -251,18 +235,13 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, return -EBADMSG; call->offset = 0; call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; case 4: _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, false); + if (ret < 0) + return ret; _debug("unmarshall CB array"); cb = call->request; @@ -273,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->type = ntohl(*bp++); } - empty_cb_array: call->offset = 0; call->unmarshall++; - case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; - /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking * work, even if the final ACK isn't received. @@ -290,19 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, * updated also. */ call->unmarshall++; - case 6: + case 5: break; } - if (!last) - return 0; - call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -329,27 +298,26 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; + int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + + ret = afs_extract_data(call, NULL, 0, false); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -362,25 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + + _enter("{%u}", call->unmarshall); - if (!last) - return 0; + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + break; + } /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -405,15 +416,15 @@ static void SRXAFSCB_Probe(struct work_struct *work) /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + int ret; + + _enter(""); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_extract_data(call, NULL, 0, false); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -437,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) reply.match = htonl(0); else @@ -450,20 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -475,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -503,15 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); @@ -585,15 +583,15 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + int ret; + + _enter(""); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_extract_data(call, NULL, 0, false); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index d91a9c9cfbd0..3191dff2c156 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -36,8 +36,8 @@ static int afs_init_lock_manager(void) if (!afs_lock_manager) { mutex_lock(&afs_lock_manager_mutex); if (!afs_lock_manager) { - afs_lock_manager = - create_singlethread_workqueue("kafs_lockd"); + afs_lock_manager = alloc_workqueue("kafs_lockd", + WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) ret = -ENOMEM; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c2e930ec2888..96f4d764d1a6 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -235,20 +235,17 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter(""); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -309,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; @@ -318,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, void *buffer; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -334,12 +330,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, * client) */ case 1: _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length MSW: %u", call->count); @@ -352,12 +345,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the returned data length */ case 2: _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length: %u", call->count); @@ -371,15 +361,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + buffer = kmap(page); + ret = afs_extract_data(call, buffer, + call->count, true); + kunmap(buffer); + if (ret < 0) + return ret; } call->offset = 0; @@ -387,13 +374,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + (21 + 3 + 6) * 4, false); + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); @@ -405,21 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page); + buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); + kunmap(buffer); } _leave(" = 0 [done]"); @@ -532,14 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server, /* * deliver reply data to an FS.GiveUpCallBacks */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + _enter(""); - if (skb->len > 0) - return -EBADMSG; /* shouldn't be any reply data */ - return 0; + /* shouldn't be any reply data */ + return afs_extract_data(call, NULL, 0, false); } /* @@ -617,20 +593,17 @@ int afs_fs_give_up_callbacks(struct afs_server *server, /* * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -716,20 +689,17 @@ int afs_fs_create(struct afs_server *server, /* * deliver reply data to an FS.RemoveFile or FS.RemoveDir */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_remove(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -799,20 +769,17 @@ int afs_fs_remove(struct afs_server *server, /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -887,20 +854,17 @@ int afs_fs_link(struct afs_server *server, /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -994,20 +958,17 @@ int afs_fs_symlink(struct afs_server *server, /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1100,25 +1061,17 @@ int afs_fs_rename(struct afs_server *server, /* * deliver reply data to an FS.StoreData */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } + _enter(""); - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1286,26 +1239,18 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, /* * deliver reply data to an FS.StoreStatus */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } + _enter(""); - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ store_version = NULL; @@ -1485,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -1502,13 +1446,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + 12 * 4, true); + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); @@ -1517,12 +1458,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); @@ -1535,13 +1473,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->reply3, + call->count, true); + if (ret < 0) + return ret; } p = call->reply3; @@ -1559,13 +1494,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + call->count, true); + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1573,12 +1505,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); @@ -1591,13 +1520,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->reply3, + call->count, true); + if (ret < 0) + return ret; } p = call->reply3; @@ -1615,13 +1541,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + call->count, true); + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1629,12 +1552,9 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); @@ -1647,13 +1567,10 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->reply3, + call->count, true); + if (ret < 0) + return ret; } p = call->reply3; @@ -1664,35 +1581,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->unmarshall++; /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); + call->count = (4 - (call->count & 3)) & 3; case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, call->buffer, + call->count, false); + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; - no_motd_padding: - case 11: - _debug("trailer %d", skb->len); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - _leave(" = 0 [done]"); return 0; } @@ -1760,19 +1662,16 @@ int afs_fs_get_volume_status(struct afs_server *server, /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 71d5982312f3..5497c8496055 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -13,13 +13,13 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/skbuff.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" @@ -56,7 +56,7 @@ struct afs_mount_params { */ struct afs_wait_mode { /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); + rxrpc_notify_rx_t notify_rx; /* synchronous call waiter and call dispatched notification */ int (*wait)(struct afs_call *call); @@ -75,10 +75,8 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ struct afs_server *server; /* server affected by incoming CM call */ @@ -92,6 +90,7 @@ struct afs_call { void *reply4; /* reply buffer (fourth part) */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ + size_t offset; /* offset into received data store */ enum { /* call state */ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ @@ -99,21 +98,18 @@ struct afs_call { AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ AFS_CALL_REPLYING, /* replying to incoming call */ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ + AFS_CALL_COMPLETE, /* Completed or failed */ } state; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ + bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ __be32 operation_ID; /* operation ID for an incoming call */ @@ -128,8 +124,7 @@ struct afs_call_type { /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); + int (*deliver)(struct afs_call *call); /* map an abort code to an error number */ int (*abort_to_error)(u32 abort_code); @@ -607,6 +602,8 @@ extern void afs_proc_cell_remove(struct afs_cell *); /* * rxrpc.c */ +extern struct socket *afs_socket; + extern int afs_open_socket(void); extern void afs_close_socket(void); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, @@ -614,11 +611,14 @@ extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, void *, size_t, bool); + +static inline int afs_transfer_reply(struct afs_call *call) +{ + return afs_extract_data(call, call->buffer, call->reply_max, false); +} /* * security.c @@ -642,7 +642,7 @@ do { \ extern struct afs_server *afs_lookup_server(struct afs_cell *, const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); +extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); extern void afs_put_server(struct afs_server *); extern void __exit afs_purge_servers(void); diff --git a/fs/afs/main.c b/fs/afs/main.c index 35de0c04729f..0b187ef3b5b7 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> +#include <linux/random.h> #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 4832de84d52c..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,34 +16,36 @@ #include "internal.h" #include "afs_cm.h" -static struct socket *afs_socket; /* my RxRPC socket */ +struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; -static void afs_wake_up_call_waiter(struct afs_call *); +static void afs_free_call(struct afs_call *); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct afs_call *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, + .notify_rx = afs_wake_up_call_waiter, .wait = afs_wait_for_call_to_complete, }; /* asynchronous call management */ const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, .wait = afs_dont_wait_for_call_to_complete, }; /* asynchronous incoming call management */ static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, }; /* asynchronous incoming call initial processing */ @@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); -static struct sk_buff_head afs_incoming_calls; -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); - -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(call); -} +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -83,10 +77,8 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - ret = -ENOMEM; - afs_async_calls = create_singlethread_workqueue("kafsd"); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); if (!afs_async_calls) goto error_0; @@ -110,13 +102,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); + ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); - afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -136,55 +130,28 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); + flush_workqueue(afs_async_calls); sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); - - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); _leave(""); } /* - * note that the data in a socket buffer is now delivered and that the buffer - * should be freed - */ -static void afs_data_delivered(struct sk_buff *skb) -{ - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_data_delivered(skb); - } -} - -/* - * free a socket buffer - */ -static void afs_free_skb(struct sk_buff *skb) -{ - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } -} - -/* * free a call */ static void afs_free_call(struct afs_call *call) @@ -194,7 +161,6 @@ static void afs_free_call(struct afs_call *call) ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); ASSERT(call->type->name != NULL); kfree(call->request); @@ -210,7 +176,7 @@ static void afs_free_call(struct afs_call *call) static void afs_end_call_nofree(struct afs_call *call) { if (call->rxcall) { - rxrpc_kernel_end_call(call->rxcall); + rxrpc_kernel_end_call(afs_socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -230,7 +196,7 @@ static void afs_end_call(struct afs_call *call) * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) + size_t request_size, size_t reply_max) { struct afs_call *call; @@ -244,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, call->type = type; call->request_size = request_size; - call->reply_max = reply_size; + call->reply_max = reply_max; if (request_size) { call->request = kmalloc(request_size, GFP_NOFS); @@ -252,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: @@ -328,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * returns from sending the request */ if (first + loop >= last) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, to - offset); kunmap(pages[loop]); if (ret < 0) break; @@ -357,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; - struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -369,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); + (unsigned long) call, gfp, + wait_mode->notify_rx); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -409,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, * request */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + ret = rxrpc_kernel_send_data(afs_socket, rxcall, + &msg, call->request_size); if (ret < 0) goto error_do_abort; @@ -424,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -434,135 +397,77 @@ error_kill_call: } /* - * Handles intercepted messages that were arriving in the socket's Rx queue. - * - * Called from the AF_RXRPC call processor in waitqueue process context. For - * each call, it is guaranteed this will be called in order of packet to be - * delivered. - */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) -{ - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); - } - - _leave(""); -} - -/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; u32 abort_code; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case 0: - if (last && - call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK + ) { + if (call->state == AFS_CALL_AWAIT_ACK) { + size_t offset = 0; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + NULL, 0, &offset, false, + &call->abort_code); + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret == 1) { + call->state = AFS_CALL_COMPLETE; + goto done; } - afs_data_delivered(skb); - skb = NULL; - continue; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Loc ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) - afs_end_call(call); + ret = call->type->deliver(call); + switch (ret) { + case 0: + if (call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KNC"); + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KIV"); + goto do_abort; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, EBADMSG, "KUM"); + goto do_abort; + } } +done: + if (call->state == AFS_CALL_COMPLETE && call->incoming) + afs_end_call(call); +out: _leave(""); + return; + +do_abort: + call->error = ret; + call->state = AFS_CALL_COMPLETE; + goto done; } /* @@ -570,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call) */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -582,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) set_current_state(TASK_INTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } + abort_why = "KWC"; ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) + if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -603,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -617,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; queue_work(afs_async_calls, &call->async_work); } @@ -645,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct afs_call *call) +static void afs_delete_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); afs_free_call(call); @@ -656,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call) /* * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time */ -static void afs_process_async_call(struct afs_call *call) +static void afs_process_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); + } - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { if (call->wait_mode->async_complete) call->wait_mode->async_complete(call->reply, call->error); @@ -677,104 +595,93 @@ static void afs_process_async_call(struct afs_call *call) /* we can't just delete the call because the work item may be * queued */ - call->async_workfn = afs_delete_async_call; + call->async_work.func = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } _leave(""); } -/* - * empty a socket buffer into a flat reply buffer - */ -void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { - size_t len = skb->len; + struct afs_call *call = (struct afs_call *)user_call_ID; - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) - BUG(); - call->reply_size += len; + call->rxcall = rxcall; } /* - * accept the backlog of incoming calls + * Charge the incoming call preallocation. */ -static void afs_collect_incoming_call(struct work_struct *work) +static void afs_charge_preallocation(struct work_struct *work) { - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - - /* don't need the notification */ - afs_free_skb(skb); + struct afs_call *call = afs_spare_incoming_call; + for (;;) { if (!call) { call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } + if (!call) + break; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); } - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call = NULL; - } + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; } + afs_spare_incoming_call = call; +} - if (call) - afs_free_call(call); +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); } /* - * grab the operation ID from an incoming cache manager call + * Notification of an incoming call. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + atomic_inc(&afs_outstanding_calls); + queue_work(afs_wq, &afs_charge_preallocation_work); +} - _enter("{%u},{%zu},%d", call->offset, len, last); +/* + * Grab the operation ID from an incoming cache manager call. The socket + * buffer is discarded on error or if we don't yet have sufficient data. + */ +static int afs_deliver_cm_op_id(struct afs_call *call) +{ + int ret; + + _enter("{%zu}", call->offset); ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - if (last) { - _leave(" = -EBADMSG [op ID short]"); - return -EBADMSG; - } - _leave(" = 0 [incomplete]"); - return 0; - } + ret = afs_extract_data(call, &call->operation_ID, 4, true); + if (ret < 0) + return ret; call->state = AFS_CALL_AWAIT_REQUEST; + call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -783,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, /* pass responsibility for the remainer of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); } /* @@ -803,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -839,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); if (n >= 0) { /* Success */ _leave(" [replied]"); @@ -848,37 +756,50 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); } /* - * extract a piece of data from the received data socket buffers + * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, void *buf, size_t count, + bool want_more) { - size_t len = skb->len; + int ret; - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + _enter("{%s,%zu},,%zu,%d", + call->type->name, call->offset, count, want_more); - ASSERTCMP(call->offset, <, count); + ASSERTCMP(call->offset, <=, count); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + buf, count, &call->offset, + want_more, &call->abort_code); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - if (last) { - _leave(" = -EBADMSG [%d < %zu]", call->offset, count); - return -EBADMSG; + if (ret == 1) { + switch (call->state) { + case AFS_CALL_AWAIT_REPLY: + call->state = AFS_CALL_COMPLETE; + break; + case AFS_CALL_AWAIT_REQUEST: + call->state = AFS_CALL_REPLYING; + break; + default: + break; } - _leave(" = -EAGAIN"); - return -EAGAIN; + return 0; } - return 0; + + if (ret == -ECONNABORTED) + call->error = call->type->abort_to_error(call->abort_code); + else + call->error = ret; + call->state = AFS_CALL_COMPLETE; + return ret; } diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf3547d..d4066ab7dd55 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -178,13 +178,18 @@ server_in_two_cells: /* * look up a server by its IP address */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) { struct afs_server *server = NULL; struct rb_node *p; - struct in_addr addr = *_addr; + struct in_addr addr = srx->transport.sin.sin_addr; - _enter("%pI4", &addr.s_addr); + _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + + if (srx->transport.family != AF_INET) { + WARN(true, "AFS does not yes support non-IPv4 addresses\n"); + return NULL; + } read_lock(&afs_servers_lock); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 340afd0cd182..94bcd97d22b8 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -58,22 +58,18 @@ static int afs_vl_abort_to_error(u32 abort_code) /* * deliver reply data to a VL.GetEntryByXXX call */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) { struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; - int loop; + int loop, ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter(""); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ entry = call->reply; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 52976785a32c..45a86396fd2d 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work) */ int __init afs_vlocation_update_init(void) { - afs_vlocation_update_worker = - create_singlethread_workqueue("kafs_vlupdated"); + afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", + WQ_MEM_RECLAIM, 0); return afs_vlocation_update_worker ? 0 : -ENOMEM; } @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b493909e7492..d8e6d421c27f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); - spin_unlock(&sbi->fs_lock); } return NULL; @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) @@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f6aff3f72eb..2472af2798c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); + install_exec_creds(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - install_exec_creds(bprm); retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) @@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } -#ifndef PR_REG_SIZE -#define PR_REG_SIZE(S) sizeof(S) -#endif - #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S) sizeof(S) -#endif - -#ifndef PR_REG_PTR -#define PR_REG_PTR(S) (&((S)->pr_reg)) +#define PRSTATUS_SIZE(S, R) sizeof(S) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, PR_REG_SIZE(t->prstatus.pr_reg), - PR_REG_PTR(&t->prstatus), NULL); + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - SET_PR_FPVALID(&t->prstatus, 1); + SET_PR_FPVALID(&t->prstatus, + 1, regset_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/block_dev.c b/fs/block_dev.c index c3cdde87cc8c..08ae99343d92 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) * thaw_bdev drops it. */ sb = get_super(bdev); - drop_super(sb); + if (sb) + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type, { struct dentry *dent; dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); - if (dent) + if (!IS_ERR(dent)) dent->d_sb->s_iflags |= SB_I_CGROUPWB; return dent; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2b88439c2ee8..455a6b2fd539 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode) list_del(&ref2->list); kmem_cache_free(btrfs_prelim_ref_cache, ref2); + cond_resched(); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2fe8f89091a3..33fe03551105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1028,6 +1029,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; @@ -1079,6 +1081,8 @@ struct btrfs_fs_info { struct list_head pinned_chunks; int creating_free_space_tree; + /* Used to record internally whether fs has been frozen */ + int fs_frozen; }; struct btrfs_subvolume_writers { @@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d9ddcfc18c91..ac02e041464b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, - delayed_refs, - qrecord); - if (qexisting) + if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + delayed_refs, qrecord)) kfree(qrecord); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59febfb8d04a..54bc8c7c6bcd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root, u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) + if (nritems == 0) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(root->fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + /* if leaf is the root, then it's fine */ + if (leaf->start != + btrfs_root_bytenr(&check_root->root_item)) { + CORRUPT("non-root leaf's nritems is 0", + leaf, root, 0); + return -EIO; + } + } return 0; + } /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + return 0; +} + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; } + if (found_level > 0 && check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -1618,8 +1655,8 @@ fail: return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; @@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); + if (root->reloc_root) { + free_extent_buffer(root->reloc_root->node); + free_extent_buffer(root->reloc_root->commit_root); + btrfs_put_fs_root(root->reloc_root); + root->reloc_root = NULL; + } + } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b3207a0e09f7..f19a982f5a4f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61b494e8e604..665da8f66ff1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, - int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -3501,7 +3487,6 @@ again: dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* - * Use new btrfs_qgroup_reserve_data to reserve precious data space - * - * TODO: Find a good method to avoid reserve data space for NOCOW - * range, but don't impact performance on quota disable case. - */ + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); + if (ret) + btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } @@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } } +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force) { @@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret == -ENOSPC) + if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: @@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim = 0; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - list_for_each_entry(ticket, &space_info->tickets, list) to_reclaim += ticket->bytes; list_for_each_entry(ticket, &space_info->priority_tickets, list) @@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, if (to_reclaim) return to_reclaim; + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5390,6 +5380,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5432,6 +5423,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", @@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper * ENOSPC accounting. For data we handle the reservation through clearing the * delalloc bits in the io_tree. We have to do this since we could end up * allocating less disk space for the amount of data we have reserved in the @@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * make the reservation and return -EAGAIN, otherwise this function always * succeeds. */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - - if (delalloc) - cache->delalloc_bytes += num_bytes; - } + if (cache->ro) { + ret = -EAGAIN; } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + ram_bytes, 0); + space_info->bytes_may_use -= ram_bytes; if (delalloc) - cache->delalloc_bytes -= num_bytes; + cache->delalloc_bytes += num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; @@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * the free space extent currently. */ static noinline int find_free_extent(struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); - int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7763,8 +7768,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -7936,7 +7941,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) @@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags, delalloc); + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(root->fs_info, ins->objectid); @@ -7958,6 +7963,7 @@ again: num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(root, start, len); } @@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); @@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; @@ -8521,35 +8533,6 @@ reada: wc->reada_slot = slot; } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ - struct btrfs_qgroup_extent_record *qrecord; - struct btrfs_delayed_ref_root *delayed_refs; - - qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); - if (!qrecord) - return -ENOMEM; - - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; - qrecord->old_roots = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, - delayed_refs, qrecord)) - kfree(qrecord); - spin_unlock(&delayed_refs->lock); - - return 0; -} - static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) @@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + bytenr, num_bytes, GFP_NOFS); if (ret) return ret; } @@ -8732,8 +8716,9 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = record_one_subtree_extent(trans, root, child_bytenr, - root->nodesize); + ret = btrfs_qgroup_insert_dirty_extent(trans, + root->fs_info, child_bytenr, + root->nodesize, GFP_NOFS); if (ret) goto out; } @@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root, } else { ret = 0; } + free_extent_map(em); goto out; } path->slots[0]++; @@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group->iref = 0; block_group->inode = NULL; spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); iput(inode); last = block_group->key.objectid + block_group->key.offset; btrfs_put_block_group(block_group); @@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc2729a7612d..28cd88fccc7e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) +#define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5842423f8f47..fea31a4a6e36 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx); + btrfs_init_log_ctx(&ctx, inode); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { @@ -2675,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2767,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - cur_offset = alloc_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); @@ -2794,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte - cur_offset); if (ret < 0) break; + } else { + /* + * Do not need to reserve unwritten extent for this + * range, free reserved data space first, otherwise + * it'll result in false ENOSPC error. + */ + btrfs_free_reserved_data_space(inode, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2811,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->start, range->len, 1 << inode->i_blkbits, offset + len, &alloc_hint); + else + btrfs_free_reserved_data_space(inode, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2845,18 +2856,11 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: - /* - * As we waited the extent range, the data_rsv_map must be empty - * in the range, as written data range will be released from it. - * And for prealloacted extent, it will also be released when - * its metadata is written. - * So this is completely used as cleanup. - */ - btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); inode_unlock(inode); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + if (ret != 0) + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - cur_offset); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index aa6fabaee72e..359ee861b5a4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,10 +495,9 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_metadata(inode, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 08dfc57e2270..e6811c42e41e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -742,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1489,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode, return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -7251,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); @@ -7751,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7785,7 +7799,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -10414,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14ed1e9e6bc8..7fd939bfbd99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93ee1c18ef9d..8db2e29fdcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else - return entry; + return 1; } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); - return NULL; + return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag) +{ + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) + return 0; + if (WARN_ON(trans == NULL)) + return -EINVAL; + record = kmalloc(sizeof(*record), gfp_flag); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, + record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) + kfree(record); + return 0; } #define UPDATE_NEW 0 @@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + mutex_unlock(&fs_info->qgroup_rescan_lock); + path = btrfs_alloc_path(); if (!path) goto out; @@ -2369,6 +2402,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 710887c06aaf..1bc64c864b62 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, @@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag); + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b26a5aea41b4..c0c13dc6fe12 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "async-thread.h" #include "free-space-cache.h" #include "inode-map.h" +#include "qgroup.h" /* * backref_node, mapping_node and tree_block start with this @@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 num_bytes; int nr = 0; int ret = 0; + u64 prealloc_start = cluster->start - offset; + u64 prealloc_end = cluster->end - offset; + u64 cur_offset; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + ret = btrfs_check_data_free_space(inode, prealloc_start, + prealloc_end + 1 - prealloc_start); if (ret) goto out; + cur_offset = prealloc_start; while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; + if (cur_offset < start) + btrfs_free_reserved_data_space(inode, cur_offset, + start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); + cur_offset = end + 1; unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space(inode, cur_offset, + prealloc_end + 1 - cur_offset); out: inode_unlock(inode); return ret; @@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc) return 0; } +/* + * Qgroup fixer for data chunk relocation. + * The data relocation is done in the following steps + * 1) Copy data extents into data reloc tree + * 2) Create tree reloc tree(special snapshot) for related subvolumes + * 3) Modify file extents in tree reloc tree + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks + * + * The problem is, data and tree reloc tree are not accounted to qgroup, + * and 4) will only info qgroup to track tree blocks change, not file extents + * in the tree blocks. + * + * The good news is, related data extents are all in data reloc tree, so we + * only need to info qgroup to track all file extents in data reloc tree + * before commit trans. + */ +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct inode *inode = rc->data_inode; + struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + /* + * Only for stage where we update data pointers the qgroup fix is + * valid. + * For MOVING_DATA stage, we will miss the timing of swapping tree + * blocks, and won't fix it. + */ + if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); + while (1) { + struct btrfs_file_extent_item *fi; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid > btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto next; + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(path->nodes[0], fi) != + BTRFS_FILE_EXTENT_REG) + goto next; + ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, + btrfs_file_extent_disk_bytenr(path->nodes[0], fi), + btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), + GFP_NOFS); + if (ret < 0) + break; +next: + ret = btrfs_next_item(data_reloc_root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); +out: + btrfs_free_path(path); + return ret; +} + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; @@ -4102,10 +4196,18 @@ restart: /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + ret = qgroup_fix_relocated_data_extents(trans, rc); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + goto out_free; + } + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - err = btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + err = qgroup_fix_relocated_data_extents(trans, rc); + if (err < 0) { + btrfs_abort_transaction(trans, err); + goto out_free; + } + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7fd7e1830cfe..091296062456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + root = btrfs_lookup_fs_root(tree_root->fs_info, + root_key.objectid); + if (root) { + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + continue; + } + root = btrfs_read_fs_root(tree_root, &root_key); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - if (err == -EEXIST) - err = 0; if (err) { + BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index efe129fe2678..a87675ffd02b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4268,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); + /* + * We don't actually care about pending_move as we are simply + * re-creating this inode and will be rename'ing it into place once we + * rename the parent directory. + */ ret = process_recorded_refs(sctx, &pending_move); - /* Only applicable to an incremental send. */ - ASSERT(pending_move == 0); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 864ce334f696..4071fe2bd098 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; + root->fs_info->fs_frozen = 1; + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ @@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + root->fs_info->fs_frozen = 0; + return 0; +} + static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9cca0a721961..95d41919d034 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); + /* + * If fs has been frozen, we can not handle delayed iputs, otherwise + * it'll result in deadlock about SB_FREEZE_FS. + */ if (current != root->fs_info->transaction_kthread && - current != root->fs_info->cleaner_kthread) + current != root->fs_info->cleaner_kthread && + !root->fs_info->fs_frozen) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fff3f3efa436..ef9c55bc7907 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -4741,7 +4758,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { struct btrfs_key inode_key; struct inode *other_inode; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75d080d..ab858e31ccbc 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx { int log_transid; int io_err; bool log_new_dentries; + struct inode *inode; struct list_head list; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; ctx->log_new_dentries = false; + ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 51f125508771..035efce603a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work) struct btrfs_device *device; device = container_of(work, struct btrfs_device, rcu_work); - - if (device->bdev) - blkdev_put(device->bdev, device->mode); - rcu_string_free(device->name); kfree(device); } @@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_bdev(struct btrfs_device *device) +{ + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + if (device->bdev) + blkdev_put(device->bdev, device->mode); +} + static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; @@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->missing) fs_devices->missing_devices--; - if (device->bdev && device->writeable) { - sync_blockdev(device->bdev); - invalidate_bdev(device->bdev); - } + btrfs_close_bdev(device); new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); @@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } + btrfs_close_bdev(device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } + + btrfs_close_bdev(srcdev); + call_rcu(&srcdev->rcu, free_device); /* @@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, * the device_list_mutex lock. */ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + + btrfs_close_bdev(tgtdev); call_rcu(&tgtdev->rcu, free_device); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c64a0b794d49..df4b3e6fa563 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag |= fpos_frag(new_pos)) { + } else if (fi->frag != fpos_frag(new_pos)) { return true; } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 6bbec5e784cd..14ae4b8e1a3c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + return dget(sb->s_root); + full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) @@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; + goto out_free; } - if (volume_info->prepath) { - cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); - if (cifs_sb->prepath == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; - } + rc = cifs_setup_cifs_sb(volume_info, cifs_sb); + if (rc) { + root = ERR_PTR(rc); + goto out_free; } - cifs_setup_cifs_sb(volume_info, cifs_sb); - rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); - goto out_mountdata; + goto out_free; } mnt_data.vol = volume_info; @@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) - root = dget(sb->s_root); - else - root = cifs_get_root(volume_info, sb); - + root = cifs_get_root(volume_info, sb); if (IS_ERR(root)) goto out_super; @@ -752,9 +747,9 @@ out: cifs_cleanup_volume_info(volume_info); return root; -out_mountdata: +out_free: + kfree(cifs_sb->prepath); kfree(cifs_sb->mountdata); -out_cifs_sb: kfree(cifs_sb); out_nls: unload_nls(volume_info->local_nls); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1243bd326591..95dab43646f0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int to_read); -extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae03283bd61..2e4f4bad8b1e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 1; } +static int +match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { + if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) + return 0; + /* The prepath should be null terminated strings */ + if (strcmp(new->prepath, old->prepath)) + return 0; + + return 1; + } + return 0; +} + int cifs_match_super(struct super_block *sb, void *data) { @@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data) if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || - !match_tcon(tcon, volume_info->UNC)) { + !match_tcon(tcon, volume_info->UNC) || + !match_prepath(sb, mnt_data)) { rc = 0; goto out; } @@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } } -void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb) { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); @@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); + + if (pvolume_info->prepath) { + cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) + return -ENOMEM; + } + + return 0; } static void diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c30cf49b69d2..2c6312db8516 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, if (bin_attr->cb_max_size && *ppos + count > bin_attr->cb_max_size) { len = -EFBIG; + goto out; } tbuf = vmalloc(*ppos + count); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0f9961eede1e..ed115acb5dee 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/fscrypto.h> +#include <linux/mount.h> static int inode_has_encryption_context(struct inode *inode) { @@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 592059f88e04..354e2ab62031 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) -#define REAL_FOPS_DEREF(dentry) \ - ((const struct file_operations *)(dentry)->d_fsdata) - static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); @@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \ { \ const struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops = \ - REAL_FOPS_DEREF(dentry); \ + debugfs_real_fops(filp); \ int srcu_idx; \ ret_type r; \ \ @@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); int srcu_idx; unsigned int r = 0; @@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; @@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) replace_fops(filp, d_inode(dentry)->i_fop); kfree((void *)proxy_fops); fops_put(real_fops); - return 0; + return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, @@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index bba52634b995..b3e8443a1f47 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; -struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); - #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d116453b0276..442d1a7e671b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - kuid_t root_uid; - kgid_t root_gid; - - root_uid = make_kuid(current_user_ns(), 0); - root_gid = make_kgid(current_user_ns(), 0); - if (!uid_valid(root_uid) || !gid_valid(root_gid)) - return -EINVAL; + kuid_t ptmx_uid = current_fsuid(); + kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); @@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); - inode->i_uid = root_uid; - inode->i_gid = root_gid; + inode->i_uid = ptmx_uid; + inode->i_gid = ptmx_gid; d_add(dentry, inode); @@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -395,6 +389,7 @@ static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; + error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); + if (error) + goto fail; + + error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; @@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent) set_nlink(inode, 2); s->s_root = d_make_root(inode); - if (s->s_root) - return 0; + if (!s->s_root) { + pr_err("get root dentry failed\n"); + goto fail; + } - pr_err("get root dentry failed\n"); + error = mknod_ptmx(s); + if (error) + goto fail_dput; + return 0; +fail_dput: + dput(s->s_root); + s->s_root = NULL; fail: - return -ENOMEM; + return error; } /* @@ -436,43 +445,15 @@ fail: static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int error; - struct pts_mount_opts opts; - struct super_block *s; - - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return ERR_PTR(error); - - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (!s->s_root) { - error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) - goto out_undo_sget; - s->s_flags |= MS_ACTIVE; - } - - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); - - error = mknod_ptmx(s); - if (error) - goto out_undo_sget; - - return dget(s->s_root); - -out_undo_sget: - deactivate_locked_super(s); - return ERR_PTR(error); + return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); - ida_destroy(&fsi->allocated_ptys); + if (fsi) + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } @@ -585,7 +566,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) */ void *devpts_get_priv(struct dentry *dentry) { - WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); + if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) + return NULL; return dentry->d_fsdata; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index eea64912c9c0..466f7d60edc2 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -607,20 +607,54 @@ static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; -static int table_open(struct inode *inode, struct file *file) +static int table_open1(struct inode *inode, struct file *file) { struct seq_file *seq; - int ret = -1; + int ret; - if (file->f_op == &format1_fops) - ret = seq_open(file, &format1_seq_ops); - else if (file->f_op == &format2_fops) - ret = seq_open(file, &format2_seq_ops); - else if (file->f_op == &format3_fops) - ret = seq_open(file, &format3_seq_ops); - else if (file->f_op == &format4_fops) - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format1_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open2(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format2_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open3(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format3_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open4(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + ret = seq_open(file, &format4_seq_ops); if (ret) return ret; @@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file) static const struct file_operations format1_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open1, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -639,7 +673,7 @@ static const struct file_operations format1_fops = { static const struct file_operations format2_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open2, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -647,7 +681,7 @@ static const struct file_operations format2_fops = { static const struct file_operations format3_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open3, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -655,7 +689,7 @@ static const struct file_operations format3_fops = { static const struct file_operations format4_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open4, .read = seq_read, .llseek = seq_lseek, .release = seq_release diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 1d73fc6dba13..cbb50cadcffc 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, inode->i_private = var; - efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &efivarfs_list); + if (err) + goto out; + d_instantiate(dentry, inode); dget(dentry); out: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 688ccc16b702..d7a7c53803c1 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } + efivar_entry_size(entry, &size); + err = efivar_entry_add(entry, &efivarfs_list); + if (err) + goto fail_inode; + /* copied by the above to local storage in the dentry. */ kfree(name); - efivar_entry_size(entry, &size); - efivar_entry_add(entry, &efivarfs_list); - inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); @@ -182,7 +184,10 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; kfree(entry); return 0; } diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index efe5fb21c533..04e73a99902b 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode, for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext2_get_group_desc(sb, group, &bh2); + if (!gdp) { + if (++group == sbi->s_groups_count) + group = 0; + continue; + } brelse(bitmap_bh); bitmap_bh = read_inode_bitmap(sb, group); if (!bitmap_bh) { diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c7dbb4661119..1e72d425fd3b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { + int i; + + /* + * We must unmap blocks before zeroing so that writeback cannot + * overwrite zeros with stale data from block device page cache. + */ + for (i = 0; i < count; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) + i); + } /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3131747199e1..c6ea25a190f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..1bb7df5e4536 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1c593aa0218e..3ec8708989ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb, grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "superblock", i); + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "superblock", i); + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_table = ext4_inode_table(sb, gdp); + if (inode_table == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "superblock", i); + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 39e9cfb1b371..2eb935ca5d9e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, size_t min_offs, free; int total_ino; void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); + /* + * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty + */ + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + goto out; header = IHDR(inode, raw_inode); entry = IFIRST(header); @@ -1382,7 +1386,7 @@ retry: goto cleanup; free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { + if (free >= isize_diff) { entry = IFIRST(header); ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + @@ -1390,8 +1394,7 @@ retry: (void *)header, total_ino, inode->i_sb->s_blocksize); EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; + goto out; } /* @@ -1414,7 +1417,7 @@ retry: end = bh->b_data + bh->b_size; min_offs = end - base; free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < new_extra_isize) { + if (free < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1428,7 +1431,7 @@ retry: free = inode->i_sb->s_blocksize; } - while (new_extra_isize > 0) { + while (isize_diff > 0) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1459,7 +1462,7 @@ retry: EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { + if (total_size < isize_diff) { small_entry = last; } else { entry = last; @@ -1514,22 +1517,22 @@ retry: error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto cleanup; + total_ino -= entry_size; entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; + if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) + shift_bytes = isize_diff; else - shift_bytes = entry_size + size; + shift_bytes = entry_size + EXT4_XATTR_SIZE(size); /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); + ext4_xattr_shift_entries(entry, -shift_bytes, + (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize + shift_bytes, + (void *)header, total_ino, inode->i_sb->s_blocksize); - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; + isize_diff -= shift_bytes; + EXT4_I(inode)->i_extra_isize += shift_bytes; + header = IHDR(inode, raw_inode); i.name = b_entry_name; i.value = buffer; @@ -1551,6 +1554,8 @@ retry: kfree(bs); } brelse(bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1562,6 +1567,10 @@ cleanup: kfree(is); kfree(bs); brelse(bh); + /* + * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode + * size expansion failed. + */ up_write(&EXT4_I(inode)->xattr_sem); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69dd3e6566e0..a92e783fa057 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -24,6 +24,7 @@ #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 #define EXT4_XATTR_INDEX_ENCRYPTION 9 +#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d64d2a515cb2..ccb401eebc11 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); - f2fs_put_page(page, 1); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); + f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 675fa79d86f6..14f5fe2b841e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -538,7 +538,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -787,7 +787,7 @@ struct f2fs_sb_info { struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ @@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - percpu_down_read(&sbi->cp_rwsem); + down_read(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - percpu_up_read(&sbi->cp_rwsem); + up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - percpu_down_write(&sbi->cp_rwsem); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - percpu_up_write(&sbi->cp_rwsem); + up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e493f63ea41..28f4f4cbb8d8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (unlikely(f2fs_readonly(src->i_sb))) return -EROFS; - if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) - return -EISDIR; + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) return -EOPNOTSUPP; inode_lock(src); - if (src != dst) - inode_lock(dst); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } ret = -EINVAL; if (pos_in + len > src->i_size || pos_in + len < pos_in) @@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, out_unlock: if (src != dst) inode_unlock(dst); +out: inode_unlock(src); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b2fa4b615925..f75d197d5beb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need; } @@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need_update; } @@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); @@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - percpu_down_write(&nm_i->nat_tree_lock); + if (!down_write_trylock(&nm_i->nat_tree_lock)) + return 0; while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; @@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->nid = nid; /* Check nat cache */ - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return; } @@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } /* @@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } up_read(&curseg->journal_rwsem); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - if (percpu_init_rwsem(&nm_i->nat_tree_lock)) - return -ENOMEM; + init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); - percpu_free_rwsem(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b86d3f638ef..7f863a645ab1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); - - percpu_free_rwsem(&sbi->cp_rwsem); } static void f2fs_put_super(struct super_block *sb) @@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) { int i, err; - if (percpu_init_rwsem(&sbi->cp_rwsem)) - return -ENOMEM; - for (i = 0; i < NR_COUNT_TYPE; i++) { err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); if (err) @@ -1686,6 +1681,7 @@ try_onemore: sbi->write_io[i].bio = NULL; } + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 1b2f6c2c3aaf..76f09ce7e5b2 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -1,5 +1,6 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" + select FS_POSIX_ACL help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index e95eeb445e58..60da84a86dab 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c new file mode 100644 index 000000000000..ec85765502f1 --- /dev/null +++ b/fs/fuse/acl.c @@ -0,0 +1,99 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> + +struct posix_acl *fuse_get_acl(struct inode *inode, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int size; + const char *name; + void *value = NULL; + struct posix_acl *acl; + + if (!fc->posix_acl || fc->no_getxattr) + return NULL; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return ERR_PTR(-EOPNOTSUPP); + + value = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = fuse_getxattr(inode, name, value, PAGE_SIZE); + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if ((size == 0) || (size == -ENODATA) || + (size == -EOPNOTSUPP && fc->no_getxattr)) + acl = NULL; + else if (size == -ERANGE) + acl = ERR_PTR(-E2BIG); + else + acl = ERR_PTR(size); + + kfree(value); + return acl; +} + +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + const char *name; + int ret; + + if (!fc->posix_acl || fc->no_setxattr) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return -EINVAL; + + if (acl) { + /* + * Fuse userspace is responsible for updating access + * permissions in the inode, if needed. fuse_setxattr + * invalidates the inode attributes, which will force + * them to be refreshed the next time they are used, + * and it also updates i_ctime. + */ + size_t size = posix_acl_xattr_size(acl->a_count); + void *value; + + if (size > PAGE_SIZE) + return -E2BIG; + + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) { + kfree(value); + return ret; + } + + ret = fuse_setxattr(inode, name, value, size, 0); + kfree(value); + } else { + ret = fuse_removexattr(inode, name); + } + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + + return ret; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a94d2ed81ab4..c41bde26c338 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->len = err; cs->offset = off; cs->pg = page; - cs->offset = off; iov_iter_advance(cs->iter, err); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c47b7780ce37..f7c84ab835ca 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -13,6 +13,8 @@ #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { @@ -37,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +union fuse_dentry { + u64 time; + struct rcu_head rcu; +}; + static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { - entry->d_time = time; + ((union fuse_dentry *) entry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(struct dentry *entry) { - return entry->d_time; -} -#else -/* - * On 32 bit archs store the high 32 bits of time in d_fsdata - */ -static void fuse_dentry_settime(struct dentry *entry, u64 time) -{ - entry->d_time = time; - entry->d_fsdata = (void *) (unsigned long) (time >> 32); -} - -static u64 fuse_dentry_time(struct dentry *entry) -{ - return (u64) entry->d_time + - ((u64) (unsigned long) entry->d_fsdata << 32); + return ((union fuse_dentry *) entry->d_fsdata)->time; } -#endif /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_time and fuse_inode->i_time respectively. + * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { - struct timespec ts = {sec, nsec}; - return get_jiffies_64() + timespec_to_jiffies(&ts); + struct timespec64 ts = { + sec, + max_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } @@ -243,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; + forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), attr_version); @@ -272,8 +267,23 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static int fuse_dentry_init(struct dentry *dentry) +{ + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + + return dentry->d_fsdata ? 0 : -ENOMEM; +} +static void fuse_dentry_release(struct dentry *dentry) +{ + union fuse_dentry *fd = dentry->d_fsdata; + + kfree_rcu(fd, rcu); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, }; int fuse_valid_type(int m) @@ -634,7 +644,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, &args, dir, entry, S_IFLNK); } -static inline void fuse_update_ctime(struct inode *inode) +void fuse_update_ctime(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_fs_time(inode->i_sb); @@ -917,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (time_before64(fi->i_time, get_jiffies_64())) { r = true; + forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else { r = false; @@ -1017,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) return 1; cred = current_cred(); @@ -1064,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; + forget_all_cached_acls(inode); return fuse_do_getattr(inode, NULL, NULL); } @@ -1092,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask) /* * If attributes are needed, refresh them before proceeding */ - if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -1105,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask) } } - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + if (fc->default_permissions) { err = generic_permission(inode, mask); /* If permission is denied, try to refresh file @@ -1233,6 +1245,7 @@ retry: fi->nlookup++; spin_unlock(&fc->lock); + forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), attr_version); @@ -1605,7 +1618,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; err = inode_change_ok(inode, attr); @@ -1702,172 +1715,75 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); - - if (!fuse_allow_current_process(get_fuse_conn(inode))) - return -EACCES; - - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); -} - -static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, - struct kstat *stat) -{ - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); -} - -static int fuse_setxattr(struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_setxattr_in inarg; - int err; - - if (fc->no_setxattr) - return -EOPNOTSUPP; + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_setxattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); + /* + * The only sane way to reliably kill suid/sgid is to do it in + * the userspace filesystem + * + * This should be done on write(), truncate() and chown(). + */ + if (!fc->handle_killpriv) { + int kill; + + /* + * ia_mode calculation may have used stale i_mode. + * Refresh and recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } } - return err; -} - -static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; + if (!attr->ia_valid) + return 0; - if (fc->no_getxattr) - return -EOPNOTSUPP; + ret = fuse_do_setattr(inode, attr, file); + if (!ret) { + /* + * If filesystem supports acls it may have updated acl xattrs in + * the filesystem, so forget cached acls for the inode. + */ + if (fc->posix_acl) + forget_all_cached_acls(inode); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_getxattr = 1; - ret = -EOPNOTSUPP; + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); } return ret; } -static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; if (!fuse_allow_current_process(fc)) return -EACCES; - if (fc->no_listxattr) - return -EOPNOTSUPP; - - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_listxattr = 1; - ret = -EOPNOTSUPP; - } - return ret; -} - -static int fuse_removexattr(struct dentry *entry, const char *name) -{ - struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - int err; - - if (fc->no_removexattr) - return -EOPNOTSUPP; - - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_removexattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); - } - return err; + return fuse_update_attributes(inode, stat, NULL, NULL); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1884,10 +1800,12 @@ static const struct inode_operations fuse_dir_inode_operations = { .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct file_operations fuse_dir_operations = { @@ -1905,10 +1823,12 @@ static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct inode_operations fuse_symlink_inode_operations = { @@ -1916,10 +1836,10 @@ static const struct inode_operations fuse_symlink_inode_operations = { .get_link = fuse_get_link, .readlink = generic_readlink, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, }; void fuse_init_common(struct inode *inode) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f394aff59c36..b7beb67bf005 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } @@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { err = req->out.h.error; break; @@ -2325,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) return retval; } -static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, - unsigned int nr_segs, size_t bytes, bool to_user) -{ - struct iov_iter ii; - int page_idx = 0; - - if (!bytes) - return 0; - - iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); - - while (iov_iter_count(&ii)) { - struct page *page = pages[page_idx++]; - size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr; - - kaddr = kmap(page); - - while (todo) { - char __user *uaddr = ii.iov->iov_base + ii.iov_offset; - size_t iov_len = ii.iov->iov_len - ii.iov_offset; - size_t copy = min(todo, iov_len); - size_t left; - - if (!to_user) - left = copy_from_user(kaddr, uaddr, copy); - else - left = copy_to_user(uaddr, kaddr, copy); - - if (unlikely(left)) - return -EFAULT; - - iov_iter_advance(&ii, copy); - todo -= copy; - kaddr += copy; - } - - kunmap(page); - } - - return 0; -} - /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -2519,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred; - int err; + size_t in_size, out_size, transferred, c; + int err, i; + struct iov_iter ii; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2602,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, req->in.args[1].size = in_size; req->in.argpages = 1; - err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, - false); - if (err) - goto out; + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } } req->out.numargs = 2; @@ -2671,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, if (transferred > inarg.out_size) goto out; - err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; out: if (req) fuse_put_request(fc, req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d98d8cc84def..24ada5dc4dae 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -23,6 +23,7 @@ #include <linux/poll.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/xattr.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -36,15 +37,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem - module will check permissions based on the file mode. Otherwise no - permission checking is done in the kernel */ -#define FUSE_DEFAULT_PERMISSIONS (1 << 0) - -/** If the FUSE_ALLOW_OTHER flag is given, then not only the user - doing the mount will be allowed to access the filesystem */ -#define FUSE_ALLOW_OTHER (1 << 1) - /** Number of page pointers embedded in fuse_req */ #define FUSE_REQ_INLINE_PAGES 1 @@ -469,9 +461,6 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; - /** The fuse mount flags for this mount */ - unsigned flags; - /** Maximum read size */ unsigned max_read; @@ -547,6 +536,9 @@ struct fuse_conn { /** allow parallel lookups and readdir (default is serialized) */ unsigned parallel_dirops:1; + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -624,6 +616,15 @@ struct fuse_conn { /** Is lseek not implemented by fs? */ unsigned no_lseek:1; + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -902,6 +903,8 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_update_ctime(struct inode *inode); + int fuse_update_attributes(struct inode *inode, struct kstat *stat, struct file *file, bool *refreshed); @@ -966,4 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc); void fuse_unlock_inode(struct inode *inode); void fuse_lock_inode(struct inode *inode); +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); +extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type); +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4e05b51120f4..17141099f2e7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -20,6 +20,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -66,7 +67,8 @@ struct fuse_mount_data { unsigned rootmode_present:1; unsigned user_id_present:1; unsigned group_id_present:1; - unsigned flags; + unsigned default_permissions:1; + unsigned allow_other:1; unsigned max_read; unsigned blksize; }; @@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, * check in may_delete(). */ fi->orig_i_mode = inode->i_mode; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; @@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return -ENOENT; fuse_invalidate_attr(inode); + forget_all_cached_acls(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) @@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_DEFAULT_PERMISSIONS: - d->flags |= FUSE_DEFAULT_PERMISSIONS; + d->default_permissions = 1; break; case OPT_ALLOW_OTHER: - d->flags |= FUSE_ALLOW_OTHER; + d->allow_other = 1; break; case OPT_MAX_READ: @@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + if (fc->default_permissions) seq_puts(m, ",default_permissions"); - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); @@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->writeback_cache = 1; if (arg->flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; + if (arg->flags & FUSE_HANDLE_KILLPRIV) + fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; + if ((arg->flags & FUSE_POSIX_ACL)) { + fc->default_permissions = 1; + fc->posix_acl = 1; + fc->sb->s_xattr = fuse_acl_xattr_handlers; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1071,6 +1081,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) } sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; @@ -1109,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->flags = d.flags; + fc->default_permissions = d.default_permissions; + fc->allow_other = d.allow_other; fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c new file mode 100644 index 000000000000..3caac46b08b0 --- /dev/null +++ b/fs/fuse/xattr.c @@ -0,0 +1,211 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +int fuse_removexattr(struct inode *inode, const char *name) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +static int fuse_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return fuse_getxattr(inode, name, value, size); +} + +static int fuse_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (!value) + return fuse_removexattr(inode, name); + + return fuse_setxattr(inode, name, value, size, flags); +} + +static const struct xattr_handler fuse_xattr_handler = { + .prefix = "", + .get = fuse_xattr_get, + .set = fuse_xattr_set, +}; + +const struct xattr_handler *fuse_xattr_handlers[] = { + &fuse_xattr_handler, + NULL +}; + +const struct xattr_handler *fuse_acl_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 82df36886938..5a6f52ea2722 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w ClearPageChecked(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + BIT(BH_Dirty)|BIT(BH_Uptodate)); } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } @@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (!page_has_buffers(page)) return 0; + /* + * From xfs_vm_releasepage: mm accommodates an old ext3 case where + * clean pages might not have had the dirty bit cleared. Thus, it can + * send actual dirty pages to ->releasepage() via shrink_active_list(). + * + * As a workaround, we skip pages that contain dirty buffers below. + * Once ->releasepage isn't called on dirty pages anymore, we can warn + * on dirty buffers like we used to here again. + */ + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); @@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd && bd->bd_tr) goto cannot_release; - if (buffer_pinned(bh) || buffer_dirty(bh)) - goto not_possible; + if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) + goto cannot_release; bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); @@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); -not_possible: /* Should never happen */ - WARN_ON(buffer_dirty(bh)); - WARN_ON(buffer_pinned(bh)); cannot_release: spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6e2bec1cd289..645721f3ff00 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, } if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, - (1 << BH_Uptodate)); + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); bh = page_buffers(page); @@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); + bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index fcb59b23f1e3..db8fbeb62483 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc) return hc; - hsize = 1 << ip->i_depth; + hsize = BIT(ip->i_depth); hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); @@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_depth; - unsigned index; + unsigned int hsize = BIT(ip->i_depth); + unsigned int index; u64 ln; if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); @@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_entries < (1 << 16)); + gfs2_assert(sdp, dip->i_entries < BIT(16)); leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { pr_warn("i_depth %u lf_depth %u index %u\n", @@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int x; int error = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hsize_bytes = hsize * sizeof(__be64); hc = gfs2_dir_get_hash_table(dip); @@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); @@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, if (error) break; - len = 1 << (dip->i_depth - depth); + len = BIT(dip->i_depth - depth); index = (index & ~(len - 1)) + len; } @@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u64 leaf_no; int error = 0, last; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) @@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) if (error) goto out; leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth)); next_index = (index & ~(len - 1)) + len; last = ((next_index >= hsize) ? 1 : 0); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 320e65e61938..360188f162bd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); - /* Update file times before taking page lock */ - file_update_time(vma->vm_file); - ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_uninit; + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3a90b2b5b9bb..14cbf60167a7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 -#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, @@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker); + if (ret) { + destroy_workqueue(gfs2_delete_workqueue); + destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); + return ret; + } return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e4da0ecd3285..fb3a810b506f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } gfs2_set_iop(inode); + + inode->i_atime.tv_sec = 0; + inode->i_atime.tv_nsec = 0; + unlock_new_inode(inode); } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 7710dfd3af35..aace8ce34a18 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode, u64 size = i_size_read(inode); if (size < minsize || size > maxsize) goto err; - if (size & ((1 << inode->i_blkbits) - 1)) + if (size & (BIT(inode->i_blkbits) - 1)) goto err; return 0; err: diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 74fd0139e6c2..67d1fc4668f7 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail; - register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker); + if (error) + goto fail; error = register_filesystem(&gfs2_fs_type); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 950b8be68e41..373639a59782 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio) static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], int num) { - struct buffer_head *bh = bhs[0]; - struct bio *bio; - int i; - - if (!num) - return; - - bio = bio_alloc(GFP_NOIO, num); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - for (i = 0; i < num; i++) { - bh = bhs[i]; - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + while (num > 0) { + struct buffer_head *bh = *bhs; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + while (num > 0) { + bh = *bhs; + if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + BUG_ON(bio->bi_iter.bi_size == 0); + break; + } + bhs++; + num--; + } + bio->bi_end_io = gfs2_meta_read_endio; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } - bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); - submit_bio(bio); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ef1e1822977f..ff72ac6439c8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; gt->gt_new_files_jdata = 0; - gt->gt_max_readahead = 1 << 18; + gt->gt_max_readahead = BIT(18); gt->gt_complain_secs = 10; } @@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - @@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) /* Compute maximum reservation required to add a entry to a directory */ - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), sdp->sd_jbsize); ind_blocks = 0; @@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 77930ca25303..8af2dfa09236 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,7 +75,7 @@ #include "util.h" #define GFS2_QD_HASH_SHIFT 12 -#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ @@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd) block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; - bh_map.b_size = 1 << ip->i_inode.i_blkbits; + bh_map.b_size = BIT(ip->i_inode.i_blkbits); error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3a7e60bb39f8..e3ee387a6dfe 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u64 size = i_size_read(jd->jd_inode); - if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) return -EIO; jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; diff --git a/fs/ioctl.c b/fs/ioctl.c index 0f56deb24ce6..c415668c86d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, void __user *arg) { struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; @@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg) } size = offsetof(struct file_dedupe_range __user, info[count]); + if (size > PAGE_SIZE) { + ret = -ENOMEM; + goto out; + } same = memdup_user(argp, size); if (IS_ERR(same)) { diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 2e58978d6f45..4d973524c887 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2893,8 +2893,7 @@ restart: * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { - list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); - INIT_LIST_HEAD(&TxAnchor.anon_list2); + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 90b3bc21e9b0..bd9b641ada2c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * cached in meta-data cache, and not written out * by txCommit(); */ - filemap_fdatawait(ipbmap->i_mapping); - filemap_write_and_wait(ipbmap->i_mapping); + rc = filemap_fdatawait(ipbmap->i_mapping); + if (rc) + goto error_out; + + rc = filemap_write_and_wait(ipbmap->i_mapping); + if (rc) + goto error_out; + diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e1574008adc9..2bcb86e6e6ca 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -840,21 +840,35 @@ repeat: mutex_lock(&kernfs_mutex); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct kernfs_node *parent; struct inode *inode; - struct dentry *dentry; + /* + * We want fsnotify_modify() on @kn but as the + * modifications aren't originating from userland don't + * have the matching @file available. Look up the inodes + * and generate the events manually. + */ inode = ilookup(info->sb, kn->ino); if (!inode) continue; - dentry = d_find_any_alias(inode); - if (dentry) { - fsnotify_parent(NULL, dentry, FS_MODIFY); - fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - NULL, 0); - dput(dentry); + parent = kernfs_get_parent(kn); + if (parent) { + struct inode *p_inode; + + p_inode = ilookup(info->sb, parent->ino); + if (p_inode) { + fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, + inode, FSNOTIFY_EVENT_INODE, kn->name, 0); + iput(p_inode); + } + + kernfs_put(parent); } + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + kn->name, 0); iput(inode); } diff --git a/fs/locks.c b/fs/locks.c index ee1b15f6fc13..90ec67108b22 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -127,7 +127,6 @@ #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> -#include <linux/lglock.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> @@ -158,12 +157,18 @@ int lease_break_time = 45; /* * The global file_lock_list is only used for displaying /proc/locks, so we - * keep a list on each CPU, with each list protected by its own spinlock via - * the file_lock_lglock. Note that alterations to the list also require that - * the relevant flc_lock is held. + * keep a list on each CPU, with each list protected by its own spinlock. + * Global serialization is done using file_rwsem. + * + * Note that alterations to the list also require that the relevant flc_lock is + * held. */ -DEFINE_STATIC_LGLOCK(file_lock_lglock); -static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +struct file_lock_list_struct { + spinlock_t lock; + struct hlist_head hlist; +}; +static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); +DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. @@ -587,15 +592,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { - lg_local_lock(&file_lock_lglock); + struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); + + percpu_rwsem_assert_held(&file_rwsem); + + spin_lock(&fll->lock); fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); - lg_local_unlock(&file_lock_lglock); + hlist_add_head(&fl->fl_link, &fll->hlist); + spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll; + + percpu_rwsem_assert_held(&file_rwsem); + /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list @@ -603,9 +616,11 @@ static void locks_delete_global_locks(struct file_lock *fl) */ if (hlist_unhashed(&fl->fl_link)) return; - lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + + fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + spin_lock(&fll->lock); hlist_del_init(&fl->fl_link); - lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); + spin_unlock(&fll->lock); } static unsigned long @@ -915,6 +930,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -955,6 +971,7 @@ find_conflict: out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -991,6 +1008,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1162,6 +1180,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); /* * Free any unused locks. */ @@ -1436,6 +1455,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1487,9 +1507,13 @@ restart: locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); + + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1506,6 +1530,7 @@ restart: } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1660,6 +1685,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1730,6 +1756,7 @@ out_setup: lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1752,6 +1779,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1764,6 +1792,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -2574,9 +2603,20 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; - if (fl->fl_nspid) - fl_pid = pid_vnr(fl->fl_nspid); - else + if (fl->fl_nspid) { + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; + + /* Don't let fl_pid change based on who is reading the file */ + fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); + + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; + } else fl_pid = fl->fl_pid; if (fl->fl_file != NULL) @@ -2648,9 +2688,13 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; fl = hlist_entry(v, struct file_lock, fl_link); + if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + return 0; + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) @@ -2703,9 +2747,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos) struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; - lg_global_lock(&file_lock_lglock); + percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) @@ -2713,14 +2757,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) struct locks_iterator *iter = f->private; ++iter->li_pos; - return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); + return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); - lg_global_unlock(&file_lock_lglock); + percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { @@ -2761,10 +2805,13 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - lg_lock_init(&file_lock_lglock, "file_lock_lglock"); - for_each_possible_cpu(i) - INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + for_each_possible_cpu(i) { + struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); + + spin_lock_init(&fll->lock); + INIT_HLIST_HEAD(&fll->hlist); + } return 0; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index f55a4e756047..217847679f0e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work) PAGE_SIZE - 1) & (loff_t)PAGE_MASK; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT); + (end - start) >> SECTOR_SHIFT, end); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 18e6fd0b9506..efc007f00742 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -141,6 +141,7 @@ struct pnfs_block_layout { struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ bool bl_scsi_layout; + u64 bl_lwb; }; static inline struct pnfs_block_layout * @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl, int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end); int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len); + sector_t len, u64 lwb); bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent *ret, bool rw); int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 992bcb19c11e..c85fbfd2d0d9 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len) + sector_t len, u64 lwb) { struct rb_root *root = &bl->bl_ext_rw; sector_t end = start + len; @@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } } out: + if (bl->bl_lwb < lwb) + bl->bl_lwb = lwb; spin_unlock(&bl->bl_ext_lock); __ext_put_deviceids(&tmp); @@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) } static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, - size_t buffer_size, size_t *count) + size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; int ret = 0; @@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; } + *lastbyte = bl->bl_lwb - 1; + bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; @@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) arg->layoutupdate_pages = &arg->layoutupdate_page; retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a7f2e6e33305..52a28311e2a4 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, err_socks: svc_rpcb_cleanup(serv, net); err_bind: + nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %p\n", ret, net); return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c92a75e066a6..f953ef6b2f2e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp, ((u32 *)&rclist->rcl_sessionid.data)[3], ref->rc_sequenceid, ref->rc_slotid); - spin_lock(&tbl->slot_tbl_lock); - status = (test_bit(ref->rc_slotid, tbl->used_slots) && - tbl->slots[ref->rc_slotid].seq_nr == - ref->rc_sequenceid); - spin_unlock(&tbl->slot_tbl_lock); + status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, + ref->rc_sequenceid, HZ >> 1) < 0; if (status) goto out; } @@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, goto out; tbl = &clp->cl_session->bc_slot_table; - slot = tbl->slots + args->csa_slotid; /* Set up res before grabbing the spinlock */ memcpy(&res->csr_sessionid, &args->csa_sessionid, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 003ebce4bbc4..1e106780a237 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready); * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, - unsigned int timeo, unsigned int retrans) + int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; @@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (to->to_initval == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; @@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; - if (!to->to_initval) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7d620970f2e1..ca699ddc11c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - written = generic_write_sync(iocb, result); + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; iocb->ki_pos += written; /* Return error values */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6206eaf2bdf..51b51369704c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) if (ffl) { INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); + ffl->last_report_time = ktime_get(); return &ffl->generic_hdr; } else return NULL; @@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, { static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; - if (ktime_equal(mirror->last_report_time, notime)) - mirror->last_report_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; - if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >= report_interval) { - mirror->last_report_time = now; + ffl->last_report_time = now; return true; } @@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; + bool fail_return = false; int idx; /* mirrors are sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (idx+1 == fls->mirror_array_cnt) + fail_return = true; + ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); if (ds) { *best_idx = idx; return ds; @@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs4_pnfs_ds *ds; int ds_idx; +retry: /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); @@ -890,12 +897,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static void @@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; +retry: if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); @@ -956,12 +961,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static unsigned int diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 1bcdb15d0c41..3ee0c9fcea76 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; - ktime_t last_report_time; u32 report_interval; }; @@ -101,6 +100,7 @@ struct nfs4_flexfile_layout { struct pnfs_ds_commit_info commit_info; struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ + ktime_t last_report_time; /* Layoutstat report times */ }; static inline struct nfs4_flexfile_layout * diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 0aa36be71fce..f7a3f6b05369 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -17,8 +17,8 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; -static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; +static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; +static unsigned int dataserver_retrans; void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { @@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) - goto out; + goto out_fail; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].rsize = max_payload; if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) mirror->mirror_ds->ds_versions[0].wsize = max_payload; - } else { - ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, - lseg->pls_range.length, NFS4ERR_NXIO, - OP_ILLEGAL, GFP_NOIO); - if (fail_return || !ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + goto out; } + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); +out_fail: + if (fail_return || !ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(ino, lseg); + ds = NULL; out: return ds; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7ce5e023c3c3..74935a19e4bf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -58,6 +58,9 @@ struct nfs_clone_mount { */ #define NFS_UNSPEC_PORT (-1) +#define NFS_UNSPEC_RETRANS (UINT_MAX) +#define NFS_UNSPEC_TIMEO (UINT_MAX) + /* * Maximum number of pages that readdir can use for creating * a vmapped array of pages. @@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); -void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6f4752734804..64b43b4ad9dd 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -318,10 +318,22 @@ static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *server = NFS_SERVER(data->args.inode); + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); + spin_unlock(&inode->i_lock); nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, &data->res.seq_res, task); + } static void @@ -341,11 +353,11 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&data->args.stateid, + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); @@ -359,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) } else spin_unlock(&inode->i_lock); break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; - default: - break; } dprintk("%s server returns %d\n", __func__, task->tk_status); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8d7d08d4f95f..cd3b7cfdde16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server, goto error; } + if (server->nfs_client == clp) { + error = -ELOOP; + goto error; + } + /* * Query for the lease time on clientid setup or renewal * diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1949bbd806eb..a9dec32ba9ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -634,15 +634,11 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs40_setup_sequence); -static int nfs40_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (slot == NULL) - goto out; - tbl = slot->table; spin_lock(&tbl->slot_tbl_lock); if (!nfs41_wake_and_assign_slot(tbl, slot)) @@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task, spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -out: +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); return 1; } @@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) tbl = slot->table; session = tbl->session; + /* Bump the slot sequence number */ + if (slot->seq_done) + slot->seq_nr++; + slot->seq_done = 0; + spin_lock(&tbl->slot_tbl_lock); /* Be nice to the server: try to ensure that the last transmitted * value for highest_user_slotid <= target_highest_slotid @@ -686,9 +693,12 @@ out_unlock: res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); + if (waitqueue_active(&tbl->slot_waitq)) + wake_up_all(&tbl->slot_waitq); } -int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +static int nfs41_sequence_process(struct rpc_task *task, + struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; @@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ @@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) goto retry_nowait; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + slot->seq_done = 1; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(res); out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { + nfs41_sequence_free_slot(res); task->tk_status = 0; ret = 0; } @@ -789,8 +799,37 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (!nfs41_sequence_process(task, res)) + return 0; + if (res->sr_slot != NULL) + nfs41_sequence_free_slot(res); + return 1; + +} EXPORT_SYMBOL_GPL(nfs41_sequence_done); +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (res->sr_slot->table->session != NULL) + return nfs41_sequence_process(task, res); + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) { + if (res->sr_slot->table->session != NULL) + nfs41_sequence_free_slot(res); + else + nfs40_sequence_free_slot(res); + } +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { if (res->sr_slot == NULL) @@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server, args, res, task); } +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref) struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); + nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); @@ -1656,9 +1707,14 @@ err: static struct nfs4_state * nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *ret; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) - return _nfs4_opendata_reclaim_to_nfs4_state(data); - return _nfs4_opendata_to_nfs4_state(data); + ret =_nfs4_opendata_reclaim_to_nfs4_state(data); + else + ret = _nfs4_opendata_to_nfs4_state(data); + nfs4_sequence_free_slot(&data->o_res.seq_res); + return ret; } static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) @@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + if (!nfs4_sequence_process(task, &data->o_res.seq_res)) return; if (task->tk_status == 0) { @@ -7514,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); @@ -7864,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(task, &lgp->res.seq_res); + nfs41_sequence_process(task, &lgp->res.seq_res); dprintk("<-- %s\n", __func__); } @@ -8080,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); + nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8106,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -8118,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; + nfs4_sequence_free_slot(&lrp->res.seq_res); rpc_restart_call_prepare(task); return; } @@ -8132,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); + nfs4_sequence_free_slot(&lrp->res.seq_res); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 332d06e64fa9..b62973045a3e 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_waitqueue_head(&tbl->slot_waitq); init_completion(&tbl->complete); } @@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) return ERR_PTR(-E2BIG); } +static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid, + u32 *seq_nr) + __must_hold(&tbl->slot_tbl_lock) +{ + struct nfs4_slot *slot; + + slot = nfs4_lookup_slot(tbl, slotid); + if (IS_ERR(slot)) + return PTR_ERR(slot); + *seq_nr = slot->seq_nr; + return 0; +} + +/* + * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use + * + * Given a slot table, slot id and sequence number, determine if the + * RPC call in question is still in flight. This function is mainly + * intended for use by the callback channel. + */ +static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr) +{ + u32 cur_seq; + bool ret = false; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 && + cur_seq == seq_nr && test_bit(slotid, tbl->used_slots)) + ret = true; + spin_unlock(&tbl->slot_tbl_lock); + return ret; +} + +/* + * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete + * + * Given a slot table, slot id and sequence number, wait until the + * corresponding RPC call completes. This function is mainly + * intended for use by the callback channel. + */ +int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout) +{ + if (wait_event_timeout(tbl->slot_waitq, + !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr), + timeout) == 0) + return -ETIMEDOUT; + return 0; +} + /* * nfs4_alloc_slot - efficiently look for a free slot * diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 5b51298d1d03..f703b755351b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -21,7 +21,8 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1; + unsigned int interrupted : 1, + seq_done : 1; }; /* Sessions */ @@ -36,6 +37,7 @@ struct nfs4_slot_table { unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ spinlock_t slot_tbl_lock; struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + wait_queue_head_t slot_waitq; /* Completion wait on slot */ u32 max_slots; /* # slots in table */ u32 max_slotid; /* Max allowed slotid value */ u32 highest_used_slotid; /* sent to server on each SEQ. @@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid); +extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout); extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 70806cae0d36..2c93a85eda51 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); @@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; - bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; + } + if (pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); /* * Because of wraparound, we want to keep the barrier @@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } @@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); @@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + nfs4_client_recover_expired_lease(clp); first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); @@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (!pnfs_layout_is_valid(lo)) { - pnfs_clear_layoutreturn_info(lo); - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - } if (res->return_on_close) @@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) data->args.fh = NFS_FH(inode); data->args.inode = inode; - nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); status = ld->prepare_layoutstats(&data->args); if (status) goto out_free; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 18d446e1a82b..d39601381adf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { + data->timeo = NFS_UNSPEC_TIMEO; + data->retrans = NFS_UNSPEC_RETRANS; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option, + unsigned long l_bound, unsigned long u_bound) +{ + int ret; + + ret = nfs_get_option_ul(args, option); + if (ret != 0) + return ret; + if (*option < l_bound || *option > u_bound) + return -ERANGE; + return 0; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw, mnt->bsize = option; break; case Opt_timeo: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX)) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX)) goto out_invalid_value; mnt->retrans = option; break; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. */ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) } /* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + +/* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. * Note that another thread calling fsnotify_clear_marks_by_group() may still @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ @@ -126,21 +132,6 @@ queue: } /* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - -/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7dabbc31060e..f165f867f332 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5922,7 +5922,6 @@ bail: } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, @@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, } } - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); i--; } @@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); @@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 98d36548153d..bbb4b3e5b4ff 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1842,6 +1842,16 @@ out_commit: ocfs2_commit_trans(osb, handle); out: + /* + * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(), + * even in case of error here like ENOSPC and ENOMEM. So, we need + * to unlock the target page manually to prevent deadlocks when + * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED + * to VM code. + */ + if (wc->w_target_locked) + unlock_page(mmap_page); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 94b18369b1cc..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,9 +44,6 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * - * New in version 12 - * - Negotiate hb timeout when storage is down. - * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -78,7 +75,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 12ULL +#define O2NET_PROTOCOL_VERSION 11ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index cdeafb4e7ed6..0bb128659d4b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; - u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; /* if it failed, move it back to granted queue. * if master returns DLM_NORMAL and then down before sending ast, * it may have already been moved to granted queue, reset to @@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); - } else if ((res->state & DLM_LOCK_RES_RECOVERING) || - (old_owner != res->owner)) { - mlog(0, "res %.*s is in recovering or has been recovered.\n", - res->lockname.len, res->lockname.name); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4e7b0dc22450..0b055bfb8e86 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ea47120a85ff..6ad3533940ba 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1199,14 +1199,24 @@ retry: inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); - if (ret == 1) + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; goto retry; + } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); - ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } } if (status < 0) { if (status != -ENOSPC) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 54e5d6681786..db37a0e02d32 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) } for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + if (ovl_is_private_xattr(name)) + continue; retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) @@ -103,6 +105,13 @@ retry: goto retry; } + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } error = vfs_setxattr(new, name, value, size, 0); if (error) break; @@ -246,6 +255,8 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *upper = NULL; umode_t mode = stat->mode; int err; + const struct cred *old_creds = NULL; + struct cred *new_creds = NULL; newdentry = ovl_lookup_temp(workdir, dentry); err = PTR_ERR(newdentry); @@ -258,10 +269,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (IS_ERR(upper)) goto out1; + err = security_inode_copy_up(dentry, &new_creds); + if (err < 0) + goto out2; + + if (new_creds) + old_creds = override_creds(new_creds); + /* Can't properly set mode on creation because of the umask */ stat->mode &= S_IFMT; err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); stat->mode = mode; + + if (new_creds) { + revert_creds(old_creds); + put_cred(new_creds); + } + if (err) goto out2; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 12bcd07b9e32..b0ffa1d1677e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,6 +12,8 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (!hardlink && !IS_POSIXACL(udir)) + stat->mode &= ~current_umask(); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); @@ -335,6 +340,32 @@ out_free: return ret; } +static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, + const struct posix_acl *acl) +{ + void *buffer; + size_t size; + int err; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) + return 0; + + size = posix_acl_to_xattr(NULL, acl, NULL, 0); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + err = size; + if (err < 0) + goto out_free; + + err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); +out_free: + kfree(buffer); + return err; +} + static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct kstat *stat, const char *link, struct dentry *hardlink) @@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *upper; struct dentry *newdentry; int err; + struct posix_acl *acl, *default_acl; if (WARN_ON(!workdir)) return -EROFS; + if (!hardlink) { + err = posix_acl_create(dentry->d_parent->d_inode, + &stat->mode, &default_acl, &acl); + if (err) + return err; + } + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } + if (!hardlink) { + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, + acl); + if (err) + goto out_cleanup; + + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, + default_acl); + if (err) + goto out_cleanup; + } if (!hardlink && S_ISDIR(stat->mode)) { err = ovl_set_opaque(newdentry); @@ -410,6 +460,10 @@ out_dput: out_unlock: unlock_rename(workdir, upperdir); out: + if (!hardlink) { + posix_acl_release(acl); + posix_acl_release(default_acl); + } return err; out_cleanup: @@ -435,6 +489,15 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (override_cred) { override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; + if (!hardlink) { + err = security_dentry_create_files_as(dentry, + stat->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; + } + } put_cred(override_creds(override_cred)); put_cred(override_cred); @@ -445,6 +508,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, err = ovl_create_over_whiteout(dentry, inode, stat, link, hardlink); } +out_revert_creds: revert_creds(old_cred); if (!err) { struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); @@ -950,9 +1014,9 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_dir_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1b885c156028..c75625c1efa3 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> #include "overlayfs.h" static int ovl_copy_up_truncate(struct dentry *dentry) @@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) return err; } -static bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(const char *name) { -#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "." - return strncmp(name, OVL_XATTR_PRE_NAME, - sizeof(OVL_XATTR_PRE_NAME) - 1) == 0; + return strncmp(name, OVL_XATTR_PREFIX, + sizeof(OVL_XATTR_PREFIX) - 1) == 0; } -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { int err; - struct dentry *upperdentry; + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; + if (!value && !OVL_TYPE_UPPER(type)) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + } + err = ovl_copy_up(dentry); if (err) goto out_drop_write; - upperdentry = ovl_dentry_upper(dentry); + if (!OVL_TYPE_UPPER(type)) + ovl_path_upper(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_setxattr(upperdentry, name, value, size, flags); + if (value) + err = vfs_setxattr(realpath.dentry, name, value, size, flags); + else { + WARN_ON(flags != XATTR_REPLACE); + err = vfs_removexattr(realpath.dentry, name); + } revert_creds(old_cred); out_drop_write: @@ -225,16 +238,13 @@ out: return err; } -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; const struct cred *old_cred; - if (ovl_is_private_xattr(name)) - return -ENODATA; - old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(realdentry, name, value, size); revert_creds(old_cred); @@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; - int off; + size_t len; + char *s; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); @@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; /* filter out private xattrs */ - for (off = 0; off < res;) { - char *s = list + off; - size_t slen = strlen(s) + 1; + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; - BUG_ON(off + slen > res); + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + len -= slen; if (ovl_is_private_xattr(s)) { res -= slen; - memmove(s, s + slen, res - off); + memmove(s, s + slen, len); } else { - off += slen; + s += slen; } } return res; } -int ovl_removexattr(struct dentry *dentry, const char *name) -{ - int err; - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); - const struct cred *old_cred; - - err = ovl_want_write(dentry); - if (err) - goto out; - - err = -ENODATA; - if (ovl_is_private_xattr(name)) - goto out_drop_write; - - if (!OVL_TYPE_UPPER(type)) { - err = vfs_getxattr(realpath.dentry, name, NULL, 0); - if (err < 0) - goto out_drop_write; - - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; - - ovl_path_upper(dentry, &realpath); - } - - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_removexattr(realpath.dentry, name); - revert_creds(old_cred); -out_drop_write: - ovl_drop_write(dentry); -out: - return err; -} - struct posix_acl *ovl_get_acl(struct inode *inode, int type) { struct inode *realinode = ovl_inode_real(inode, NULL); const struct cred *old_cred; struct posix_acl *acl; - if (!IS_POSIXACL(realinode)) + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; if (!realinode->i_op->get_acl) return NULL; old_cred = ovl_override_creds(inode->i_sb); - acl = realinode->i_op->get_acl(realinode, type); + acl = get_acl(realinode, type); revert_creds(old_cred); return acl; @@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; @@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = { .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .update_time = ovl_update_time, }; @@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif mode &= S_IFMT; switch (mode) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e4f5c9536bfe..5813ccff8cd9 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -24,8 +24,8 @@ enum ovl_path_type { (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) -#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay" -#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque" +#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." +#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_ISUPPER_MASK 1UL @@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); int ovl_check_d_type_supported(struct path *realpath); +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags); -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size); +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); -int ovl_removexattr(struct dentry *dentry, const char *name); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); +bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index cf37fc76fc9f..f241b4ee3d8a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath, err = rdd->err; } while (!err && rdd->count); - if (!err && rdd->first_maybe_whiteout) + if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath->dentry, rdd); fput(realfile); @@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath) return rdd.d_type_supported; } + +static void ovl_workdir_cleanup_recurse(struct path *path, int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = RB_ROOT, + .is_lowest = false, + }; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + dentry = lookup_one_len(p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + ovl_workdir_cleanup(dir, path->mnt, dentry, level); + dput(dentry); + } + inode_unlock(dir); +out: + ovl_cache_free(&list); +} + +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + ovl_cleanup(dir, dentry); + return; + } + + err = ovl_do_rmdir(dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + ovl_workdir_cleanup_recurse(&path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + ovl_cleanup(dir, dentry); + } +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4036132842b5..e2a94a26767b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -814,6 +814,10 @@ retry: struct kstat stat = { .mode = S_IFDIR | 0, }; + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat.mode, + }; if (work->d_inode) { err = -EEXIST; @@ -821,7 +825,7 @@ retry: goto out_dput; retried = true; - ovl_cleanup(dir, work); + ovl_workdir_cleanup(dir, mnt, work, 0); dput(work); goto retry; } @@ -829,6 +833,21 @@ retry: err = ovl_create_real(dir, work, &stat, NULL, NULL, true); if (err) goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; } out_unlock: inode_unlock(dir); @@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int __maybe_unused +ovl_posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, handler->name, buffer, size); +} + +static int __maybe_unused +ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { struct dentry *workdir = ovl_workdir(dentry); struct inode *realinode = ovl_inode_real(inode, NULL); @@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, posix_acl_release(acl); - return ovl_setxattr(dentry, inode, handler->name, value, size, flags); + err = ovl_xattr_set(dentry, handler->name, value, size, flags); + if (!err) + ovl_copyattr(ovl_inode_real(inode, NULL), inode); + + return err; out_acl_release: posix_acl_release(acl); return err; } -static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ovl_setxattr(dentry, inode, name, value, size, flags); + return -EPERM; } static int ovl_own_xattr_set(const struct xattr_handler *handler, @@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler, return -EPERM; } -static const struct xattr_handler ovl_posix_acl_access_xattr_handler = { +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, name, value, size, flags); +} + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; -static const struct xattr_handler ovl_posix_acl_default_xattr_handler = { +static const struct xattr_handler __maybe_unused +ovl_posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; static const struct xattr_handler ovl_own_xattr_handler = { .prefix = OVL_XATTR_PREFIX, + .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_other_xattr_handler = { .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, .set = ovl_other_xattr_set, }; static const struct xattr_handler *ovl_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL &ovl_posix_acl_access_xattr_handler, &ovl_posix_acl_default_xattr_handler, +#endif &ovl_own_xattr_handler, &ovl_other_xattr_handler, NULL }; -static const struct xattr_handler *ovl_xattr_noacl_handlers[] = { - &ovl_own_xattr_handler, - &ovl_other_xattr_handler, - NULL, -}; - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { NULL, NULL }; @@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directries, limit is %d\n", + pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_free_lowertmp; } else if (!ufs->config.upperdir && stacklen == 1) { @@ -1269,10 +1317,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; - if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) - sb->s_xattr = ovl_xattr_handlers; - else - sb->s_xattr = ovl_xattr_noacl_handlers; + sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; sb->s_fs_info = ufs; sb->s_flags |= MS_POSIXACL; diff --git a/fs/proc/base.c b/fs/proc/base.c index 54e270262979..3b792ab3c0dc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pS\n", + seq_printf(m, "[<%pK>] %pB\n", (void *)entries[i], (void *)entries[i]); } unlock_trace(task); @@ -1556,18 +1556,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index c633476616e0..bca66d83a765 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); + proc_set_user(ent, (*parent)->uid, (*parent)->gid); + out: return ent; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a939f5ed7f89..5c89a07e3d7f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens @@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index c8bbc68cdb05..7ae6b1da7cab 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> +#include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> @@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = { static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; + kuid_t uid; + kgid_t gid; int err; err = -ENOMEM; @@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net) netd->parent = &proc_root; memcpy(netd->name, "net", 4); + uid = make_kuid(net->user_ns, 0); + if (!uid_valid(uid)) + uid = netd->uid; + + gid = make_kgid(net->user_ns, 0); + if (!gid_valid(gid)) + gid = netd->gid; + + proc_set_user(netd, uid, gid); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1b93650dda2f..2ed3d71d4767 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { + struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; @@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (is_empty_dir(head)) make_empty_dir_inode(inode); } + + if (root->set_ownership) + root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + out: return inode; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..f6fa99eca515 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 35df08ee9c97..2d445425aad7 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -341,6 +341,7 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -365,17 +366,19 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { /* * Q_XGETQSTAT doesn't have room for both group and project * quotas. So, allow the project quota values to be copied out @@ -411,6 +414,7 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -435,17 +439,19 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 183a212694bf..12af0490322f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -27,9 +27,17 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/ramfs.h> +#include <linux/sched.h> #include "internal.h" +static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, + .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a4a85a6821e..74d5ddd26296 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s, static int reiserfs_quota_on_mount(struct super_block *, int); #endif -/* look for uncompleted unlinks and truncates and complete them */ +/* + * Look for uncompleted unlinks and truncates and complete them + * + * Called with superblock write locked. If quotas are enabled, we have to + * release/retake lest we call dquot_quota_on_mount(), proceed to + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per + * cpu worklets to complete flush_async_commits() that in turn wait for the + * superblock write lock. + */ static int finish_unfinished(struct super_block *s) { INITIALIZE_PATH(path); @@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s) quota_enabled[i] = 0; continue; } + reiserfs_write_unlock(s); ret = reiserfs_quota_on_mount(s, i); + reiserfs_write_lock(s); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " diff --git a/fs/seq_file.c b/fs/seq_file.c index 19f532e7d35e..6dc4296eed62 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) + if (!m->count) { + m->from = 0; m->index++; + } if (!size) goto Done; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f35523d4fa3a..b803213d1307 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ - if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); + if (pos) { + if (len <= pos) + return 0; + len -= pos; + memmove(buf, buf + pos, len); + } return min(count, len); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index dc1358b5ec95..ac2de0ed69ad 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -233,8 +233,8 @@ void sysfs_remove_group(struct kobject *kobj, kn = kernfs_find_and_get(parent, grp->name); if (!kn) { WARN(!kn, KERN_WARNING - "sysfs group %p not found for kobject '%s'\n", - grp, kobject_name(kobj)); + "sysfs group '%s' not found for kobject '%s'\n", + grp->name, kobject_name(kobj)); return; } } else { diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index b45345d701e7..51157da3f76e 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) p = c->gap_lebs; do { - ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e237811f09ce..11a004114eba 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, inode->i_ino, dentry, size); - return __ubifs_getxattr(inode, name, buffer, size); + name = xattr_full_name(handler, name); + return __ubifs_getxattr(inode, name, buffer, size); } static int ubifs_xattr_set(const struct xattr_handler *handler, @@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); + name = xattr_full_name(handler, name); + if (value) return __ubifs_setxattr(inode, name, value, size, flags); else diff --git a/fs/udf/file.c b/fs/udf/file.c index 632570617327..e855bf8d74b4 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -94,7 +94,7 @@ static int udf_adinicb_write_begin(struct file *file, return -ENOMEM; *pagep = page; - if (!PageUptodate(page) && len != PAGE_SIZE) + if (!PageUptodate(page)) __udf_adinicb_readpage(page); return 0; } @@ -105,11 +105,25 @@ static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; } +static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + return copied; +} + const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, - .write_end = simple_write_end, + .write_end = udf_adinicb_write_end, .direct_IO = udf_adinicb_direct_IO, }; |