diff options
Diffstat (limited to 'fs')
151 files changed, 2517 insertions, 1280 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 4b0eff6da674..85737e96ab8b 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract FID count"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); @@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("extract FID array"); ret = afs_extract_data(call, skb, last, call->buffer, call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; _debug("unmarshall FID array"); call->request = kcalloc(call->count, @@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 3: _debug("extract CB count"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); @@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("extract CB array"); ret = afs_extract_data(call, skb, last, call->request, call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; _debug("unmarshall CB array"); cb = call->request; @@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking @@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, break; } - if (!last) - return 0; call->state = AFS_CALL_REPLYING; @@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, { struct afs_server *server; struct in_addr addr; + int ret; _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, _enter(",{%u},%d", skb->len, last); + /* There are some arguments that we ignore */ + afs_data_consumed(call, skb); if (!last) - return 0; + return -EAGAIN; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, bool last) { + int ret; + _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; switch (call->unmarshall) { case 0: @@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, break; } - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; call->state = AFS_CALL_REPLYING; @@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, struct sk_buff *skb, bool last) { + int ret; + _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c2e930ec2888..9312b92e54be 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 1: _debug("extract data length (MSW)"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length MSW: %u", call->count); @@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 2: _debug("extract data length"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length: %u", call->count); @@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, ret = afs_extract_data(call, skb, last, buffer, call->count); kunmap_atomic(buffer); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } call->offset = 0; @@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 4: ret = afs_extract_data(call, skb, last, call->buffer, (21 + 3 + 6) * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); @@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; break; } - if (!last) - return 0; - if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; @@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, { _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; /* shouldn't be any reply data */ - return 0; + /* shouldn't be any reply data */ + return afs_data_complete(call, skb, last); } /* @@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call, { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call, { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call, afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ store_version = NULL; @@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, _debug("extract status"); ret = afs_extract_data(call, skb, last, call->buffer, 12 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); @@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); @@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 4: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); @@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 7: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); @@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 10: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; no_motd_padding: case 11: - _debug("trailer %d", skb->len); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; break; } - if (!last) - return 0; - _leave(" = 0 [done]"); return 0; } @@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call, struct sk_buff *skb, bool last) { const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 71d5982312f3..df976b2a7f40 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *); */ extern int afs_open_socket(void); extern void afs_close_socket(void); +extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); +extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, size_t); +static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + if (skb->len > 0) + return -EBADMSG; + afs_data_consumed(call, skb); + if (!last) + return -EAGAIN; + return 0; +} + /* * security.c */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 4832de84d52c..14d04c848465 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -150,10 +150,9 @@ void afs_close_socket(void) } /* - * note that the data in a socket buffer is now delivered and that the buffer - * should be freed + * Note that the data in a socket buffer is now consumed. */ -static void afs_data_delivered(struct sk_buff *skb) +void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) { if (!skb) { _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); @@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb) } else { _debug("DLVR %p{%u} [%d]", skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_data_delivered(skb); + rxrpc_kernel_data_consumed(call->rxcall, skb); } } @@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call) last = rxrpc_kernel_is_data_last(skb); ret = call->type->deliver(call, skb, last); switch (ret) { + case -EAGAIN: + if (last) { + _debug("short data"); + goto unmarshal_error; + } + break; case 0: - if (last && - call->state == AFS_CALL_AWAIT_REPLY) + ASSERT(last); + if (call->state == AFS_CALL_AWAIT_REPLY) call->state = AFS_CALL_COMPLETE; break; case -ENOTCONN: @@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code = RX_INVALID_OPERATION; goto do_abort; default: + unmarshal_error: abort_code = RXGEN_CC_UNMARSHAL; if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; @@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call) call->state = AFS_CALL_ERROR; break; } - afs_data_delivered(skb); - skb = NULL; - continue; + break; case RXRPC_SKB_MARK_FINAL_ACK: _debug("Rcv ACK"); call->state = AFS_CALL_COMPLETE; @@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call) } /* - * empty a socket buffer into a flat reply buffer + * Empty a socket buffer into a flat reply buffer. */ -void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) { size_t len = skb->len; - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) - BUG(); - call->reply_size += len; + if (len > call->reply_max - call->reply_size) { + _leave(" = -EBADMSG [%zu > %u]", + len, call->reply_max - call->reply_size); + return -EBADMSG; + } + + if (len > 0) { + if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, + len) < 0) + BUG(); + call->reply_size += len; + } + + afs_data_consumed(call, skb); + if (!last) + return -EAGAIN; + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + return 0; } /* @@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work) } /* - * grab the operation ID from an incoming cache manager call + * Grab the operation ID from an incoming cache manager call. The socket + * buffer is discarded on error or if we don't yet have sufficient data. */ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, bool last) @@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, call->offset += len; if (call->offset < 4) { - if (last) { - _leave(" = -EBADMSG [op ID short]"); - return -EBADMSG; - } - _leave(" = 0 [incomplete]"); - return 0; + afs_data_consumed(call, skb); + _leave(" = -EAGAIN"); + return -EAGAIN; } call->state = AFS_CALL_AWAIT_REQUEST; @@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) } /* - * extract a piece of data from the received data socket buffers + * Extract a piece of data from the received data socket buffers. */ int afs_extract_data(struct afs_call *call, struct sk_buff *skb, bool last, void *buf, size_t count) @@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb, call->offset += len; if (call->offset < count) { - if (last) { - _leave(" = -EBADMSG [%d < %zu]", call->offset, count); - return -EBADMSG; - } + afs_data_consumed(call, skb); _leave(" = -EAGAIN"); return -EAGAIN; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 340afd0cd182..f94d1abdc3eb 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; - int loop; + int loop, ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ entry = call->reply; @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup @@ -269,14 +274,17 @@ __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; + struct address_space *i_mapping; + if (aio_ring_file) { truncate_setsize(aio_ring_file->f_inode, 0); /* Prevent further access to the kioctx from migratepages */ - spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); - aio_ring_file->f_inode->i_mapping->private_data = NULL; + i_mapping = aio_ring_file->f_inode->i_mapping; + spin_lock(&i_mapping->private_lock); + i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + spin_unlock(&i_mapping->private_lock); fput(aio_ring_file); } diff --git a/fs/attr.c b/fs/attr.c index 83c8430d908f..a19a64d41e7e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -215,6 +215,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b493909e7492..d8e6d421c27f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); - spin_unlock(&sbi->fs_lock); } return NULL; @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) @@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f6aff3f72eb..e5495f37c6ed 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); + install_exec_creds(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - install_exec_creds(bprm); retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) diff --git a/fs/block_dev.c b/fs/block_dev.c index c3cdde87cc8c..08ae99343d92 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) * thaw_bdev drops it. */ sb = get_super(bdev); - drop_super(sb); + if (sb) + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type, { struct dentry *dent; dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); - if (dent) + if (!IS_ERR(dent)) dent->d_sb->s_iflags |= SB_I_CGROUPWB; return dent; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2b88439c2ee8..455a6b2fd539 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode) list_del(&ref2->list); kmem_cache_free(btrfs_prelim_ref_cache, ref2); + cond_resched(); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2fe8f89091a3..e62fd50237e4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1028,6 +1029,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; @@ -1079,6 +1081,8 @@ struct btrfs_fs_info { struct list_head pinned_chunks; int creating_free_space_tree; + /* Used to record internally whether fs has been frozen */ + int fs_frozen; }; struct btrfs_subvolume_writers { @@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3157,7 +3161,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index b6d210e7a993..ac02e041464b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, - delayed_refs, - qrecord); - if (qexisting) + if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + delayed_refs, qrecord)) kfree(qrecord); } @@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *ref_head; - int ret = 0; - - if (!fs_info->quota_enabled || !is_fstree(ref_root)) - return 0; - - delayed_refs = &trans->transaction->delayed_refs; - - spin_lock(&delayed_refs->lock); - ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); - if (!ref_head) { - ret = -ENOENT; - goto out; - } - WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); - ref_head->qgroup_ref_root = ref_root; - ref_head->qgroup_reserved = num_bytes; -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5fca9534a271..43f3629760e9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59febfb8d04a..54bc8c7c6bcd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root, u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) + if (nritems == 0) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(root->fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + /* if leaf is the root, then it's fine */ + if (leaf->start != + btrfs_root_bytenr(&check_root->root_item)) { + CORRUPT("non-root leaf's nritems is 0", + leaf, root, 0); + return -EIO; + } + } return 0; + } /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + return 0; +} + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; } + if (found_level > 0 && check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -1618,8 +1655,8 @@ fail: return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; @@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); + if (root->reloc_root) { + free_extent_buffer(root->reloc_root->node); + free_extent_buffer(root->reloc_root->commit_root); + btrfs_put_fs_root(root->reloc_root); + root->reloc_root = NULL; + } + } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b3207a0e09f7..f19a982f5a4f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61b494e8e604..665da8f66ff1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, - int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -3501,7 +3487,6 @@ again: dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* - * Use new btrfs_qgroup_reserve_data to reserve precious data space - * - * TODO: Find a good method to avoid reserve data space for NOCOW - * range, but don't impact performance on quota disable case. - */ + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); + if (ret) + btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } @@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } } +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force) { @@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret == -ENOSPC) + if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: @@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim = 0; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - list_for_each_entry(ticket, &space_info->tickets, list) to_reclaim += ticket->bytes; list_for_each_entry(ticket, &space_info->priority_tickets, list) @@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, if (to_reclaim) return to_reclaim; + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5390,6 +5380,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5432,6 +5423,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", @@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper * ENOSPC accounting. For data we handle the reservation through clearing the * delalloc bits in the io_tree. We have to do this since we could end up * allocating less disk space for the amount of data we have reserved in the @@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * make the reservation and return -EAGAIN, otherwise this function always * succeeds. */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - - if (delalloc) - cache->delalloc_bytes += num_bytes; - } + if (cache->ro) { + ret = -EAGAIN; } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + ram_bytes, 0); + space_info->bytes_may_use -= ram_bytes; if (delalloc) - cache->delalloc_bytes -= num_bytes; + cache->delalloc_bytes += num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; @@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * the free space extent currently. */ static noinline int find_free_extent(struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); - int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7763,8 +7768,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -7936,7 +7941,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) @@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags, delalloc); + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(root->fs_info, ins->objectid); @@ -7958,6 +7963,7 @@ again: num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(root, start, len); } @@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); @@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; @@ -8521,35 +8533,6 @@ reada: wc->reada_slot = slot; } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ - struct btrfs_qgroup_extent_record *qrecord; - struct btrfs_delayed_ref_root *delayed_refs; - - qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); - if (!qrecord) - return -ENOMEM; - - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; - qrecord->old_roots = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, - delayed_refs, qrecord)) - kfree(qrecord); - spin_unlock(&delayed_refs->lock); - - return 0; -} - static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) @@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + bytenr, num_bytes, GFP_NOFS); if (ret) return ret; } @@ -8732,8 +8716,9 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = record_one_subtree_extent(trans, root, child_bytenr, - root->nodesize); + ret = btrfs_qgroup_insert_dirty_extent(trans, + root->fs_info, child_bytenr, + root->nodesize, GFP_NOFS); if (ret) goto out; } @@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root, } else { ret = 0; } + free_extent_map(em); goto out; } path->slots[0]++; @@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group->iref = 0; block_group->inode = NULL; spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); iput(inode); last = block_group->key.objectid + block_group->key.offset; btrfs_put_block_group(block_group); @@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc2729a7612d..28cd88fccc7e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) +#define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9404121fd5f7..4843cb994835 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * An ordered extent might have started before and completed + * already with io errors, in which case the inode was not + * updated and we end up here. So check the inode's mapping + * flags for any errors that might have happened while doing + * writeback of file data. + */ + ret = filemap_check_errors(inode->i_mapping); inode_unlock(inode); goto out; } @@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx); + btrfs_init_log_ctx(&ctx, inode); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { @@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - cur_offset = alloc_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); @@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte - cur_offset); if (ret < 0) break; + } else { + /* + * Do not need to reserve unwritten extent for this + * range, free reserved data space first, otherwise + * it'll result in false ENOSPC error. + */ + btrfs_free_reserved_data_space(inode, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->start, range->len, 1 << inode->i_blkbits, offset + len, &alloc_hint); + else + btrfs_free_reserved_data_space(inode, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2837,18 +2856,11 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: - /* - * As we waited the extent range, the data_rsv_map must be empty - * in the range, as written data range will be released from it. - * And for prealloacted extent, it will also be released when - * its metadata is written. - * So this is completely used as cleanup. - */ - btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); inode_unlock(inode); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + if (ret != 0) + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - cur_offset); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index aa6fabaee72e..359ee861b5a4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,10 +495,9 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_metadata(inode, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfa0b07160d5..af7039a82a29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -742,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1489,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode, return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ESTALE) + if (ret && ret != -ENOENT) goto out; - if (ret == -ESTALE && root == root->fs_info->tree_root) { + if (ret == -ENOENT && root == root->fs_info->tree_root) { struct btrfs_root *dead_root; struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; @@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * Inode is already gone but the orphan item is still there, * kill the orphan item. */ - if (ret == -ESTALE) { + if (ret == -ENOENT) { trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto make_bad; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto make_bad; + } leaf = path->nodes[0]; @@ -3812,11 +3824,12 @@ cache_acl: } btrfs_update_iflags(inode); - return; + return 0; make_bad: btrfs_free_path(path); make_bad_inode(inode); + return ret; } /* @@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; + u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (err) goto out; + last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); - if (!err) + if (!err) { btrfs_i_size_write(inode, 0); + /* + * Propagate the last_unlink_trans value of the deleted dir to + * its parent directory. This is to prevent an unrecoverable + * log tree in the case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + */ + if (last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; + } out: btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); + int ret; + + ret = btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, } else { unlock_new_inode(inode); iput(inode); - inode = ERR_PTR(-ESTALE); + ASSERT(ret < 0); + inode = ERR_PTR(ret < 0 ? ret : -ESTALE); } } @@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); @@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7759,7 +7799,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -10388,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } @@ -10500,21 +10543,6 @@ out_inode: } -/* Inspired by filemap_check_errors() */ -int btrfs_inode_check_errors(struct inode *inode) -{ - int ret = 0; - - if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && - test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) - ret = -ENOSPC; - if (test_bit(AS_EIO, &inode->i_mapping->flags) && - test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) - ret = -EIO; - - return ret; -} - static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14ed1e9e6bc8..7fd939bfbd99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93ee1c18ef9d..8db2e29fdcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else - return entry; + return 1; } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); - return NULL; + return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag) +{ + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) + return 0; + if (WARN_ON(trans == NULL)) + return -EINVAL; + record = kmalloc(sizeof(*record), gfp_flag); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, + record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) + kfree(record); + return 0; } #define UPDATE_NEW 0 @@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + mutex_unlock(&fs_info->qgroup_rescan_lock); + path = btrfs_alloc_path(); if (!path) goto out; @@ -2369,6 +2402,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 710887c06aaf..1bc64c864b62 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, @@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag); + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b26a5aea41b4..c0c13dc6fe12 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "async-thread.h" #include "free-space-cache.h" #include "inode-map.h" +#include "qgroup.h" /* * backref_node, mapping_node and tree_block start with this @@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 num_bytes; int nr = 0; int ret = 0; + u64 prealloc_start = cluster->start - offset; + u64 prealloc_end = cluster->end - offset; + u64 cur_offset; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + ret = btrfs_check_data_free_space(inode, prealloc_start, + prealloc_end + 1 - prealloc_start); if (ret) goto out; + cur_offset = prealloc_start; while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; + if (cur_offset < start) + btrfs_free_reserved_data_space(inode, cur_offset, + start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); + cur_offset = end + 1; unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space(inode, cur_offset, + prealloc_end + 1 - cur_offset); out: inode_unlock(inode); return ret; @@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc) return 0; } +/* + * Qgroup fixer for data chunk relocation. + * The data relocation is done in the following steps + * 1) Copy data extents into data reloc tree + * 2) Create tree reloc tree(special snapshot) for related subvolumes + * 3) Modify file extents in tree reloc tree + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks + * + * The problem is, data and tree reloc tree are not accounted to qgroup, + * and 4) will only info qgroup to track tree blocks change, not file extents + * in the tree blocks. + * + * The good news is, related data extents are all in data reloc tree, so we + * only need to info qgroup to track all file extents in data reloc tree + * before commit trans. + */ +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct inode *inode = rc->data_inode; + struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + /* + * Only for stage where we update data pointers the qgroup fix is + * valid. + * For MOVING_DATA stage, we will miss the timing of swapping tree + * blocks, and won't fix it. + */ + if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); + while (1) { + struct btrfs_file_extent_item *fi; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid > btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto next; + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(path->nodes[0], fi) != + BTRFS_FILE_EXTENT_REG) + goto next; + ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, + btrfs_file_extent_disk_bytenr(path->nodes[0], fi), + btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), + GFP_NOFS); + if (ret < 0) + break; +next: + ret = btrfs_next_item(data_reloc_root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); +out: + btrfs_free_path(path); + return ret; +} + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; @@ -4102,10 +4196,18 @@ restart: /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + ret = qgroup_fix_relocated_data_extents(trans, rc); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + goto out_free; + } + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - err = btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + err = qgroup_fix_relocated_data_extents(trans, rc); + if (err < 0) { + btrfs_abort_transaction(trans, err); + goto out_free; + } + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7fd7e1830cfe..091296062456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + root = btrfs_lookup_fs_root(tree_root->fs_info, + root_key.objectid); + if (root) { + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + continue; + } + root = btrfs_read_fs_root(tree_root, &root_key); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - if (err == -EEXIST) - err = 0; if (err) { + BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b71dd298385c..1379e59277e2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -231,7 +231,6 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; - bool is_orphan; struct list_head update_refs; }; @@ -274,6 +273,39 @@ struct name_cache_entry { char name[]; }; +static void inconsistent_snapshot_error(struct send_ctx *sctx, + enum btrfs_compare_tree_result result, + const char *what) +{ + const char *result_string; + + switch (result) { + case BTRFS_COMPARE_TREE_NEW: + result_string = "new"; + break; + case BTRFS_COMPARE_TREE_DELETED: + result_string = "deleted"; + break; + case BTRFS_COMPARE_TREE_CHANGED: + result_string = "updated"; + break; + case BTRFS_COMPARE_TREE_SAME: + ASSERT(0); + result_string = "unchanged"; + break; + default: + ASSERT(0); + result_string = "unexpected"; + } + + btrfs_err(sctx->send_root->fs_info, + "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", + result_string, what, sctx->cmp_key->objectid, + sctx->send_root->root_key.objectid, + (sctx->parent_root ? + sctx->parent_root->root_key.objectid : 0)); +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * was already unlinked/moved, so we can safely assume that we will not * overwrite anything at this point in time. */ - if (other_inode > sctx->send_progress) { + if (other_inode > sctx->send_progress || + is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, who_gen, NULL, NULL, NULL, NULL); if (ret < 0) @@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret > 0) + ret = -ENOENT; if (ret < 0) goto out; @@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, } if (loc.objectid > send_progress) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, dir); + free_orphan_dir_info(sctx, odi); ret = 0; goto out; } @@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; - pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) u64 parent_ino, parent_gen; struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; + u64 ancestor; + bool is_orphan; int ret; name = fs_path_alloc(); @@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); rmdir_ino = dm->rmdir_ino; + is_orphan = dm->orphanized; free_waiting_dir_move(sctx, dm); - if (pm->is_orphan) { + if (is_orphan) { ret = gen_unique_name(sctx, pm->ino, pm->gen, from_path); } else { @@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret < 0) + goto out; + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs, + is_orphan); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } + goto out; + } fs_path_reset(name); to_path = name; name = NULL; @@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) /* already deleted */ goto finish; } - ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino); if (ret < 0) goto out; if (!ret) @@ -3204,8 +3305,18 @@ finish: * and old parent(s). */ list_for_each_entry(cur, &pm->update_refs, list) { - if (cur->dir == rmdir_ino) + /* + * The parent inode might have been deleted in the send snapshot + */ + ret = get_inode_info(sctx->send_root, cur->dir, NULL, + NULL, NULL, NULL, NULL, NULL); + if (ret == -ENOENT) { + ret = 0; continue; + } + if (ret < 0) + goto out; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, u64 left_gen; u64 right_gen; int ret = 0; + struct waiting_dir_move *wdm; if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) return 0; @@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - if (is_waiting_for_move(sctx, di_key.objectid)) { + wdm = get_waiting_dir_move(sctx, di_key.objectid); + if (wdm && !wdm->orphanized) { ret = add_pending_dir_move(sctx, sctx->cur_ino, sctx->cur_inode_gen, @@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = is_ancestor(sctx->parent_root, sctx->cur_ino, sctx->cur_inode_gen, ino, path_before); - break; + if (ret) + break; } fs_path_reset(path_before); @@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; if (ret) { struct name_cache_entry *nce; + struct waiting_dir_move *wdm; ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + + /* + * If ow_inode has its rename operation delayed + * make sure that its orphanized name is used in + * the source path when performing its rename + * operation. + */ + if (is_waiting_for_move(sctx, ow_inode)) { + wdm = get_waiting_dir_move(sctx, + ow_inode); + ASSERT(wdm); + wdm->orphanized = true; + } + /* * Make sure we clear our orphanized inode's * name from the name cache. This is because the @@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); name_cache_delete(sctx, nce); kfree(nce); } + + /* + * ow_inode might currently be an ancestor of + * cur_ino, therefore compute valid_path (the + * current path of cur_ino) again because it + * might contain the pre-orphanization name of + * ow_inode, which is no longer valid. + */ + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); + /* + * We don't actually care about pending_move as we are simply + * re-creating this inode and will be rename'ing it into place once we + * rename the parent directory. + */ ret = process_recorded_refs(sctx, &pending_move); - /* Only applicable to an incremental send. */ - ASSERT(pending_move == 0); - out: btrfs_free_path(path); return ret; @@ -4185,7 +4329,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - posix_acl_xattr_header dummy_acl; + struct posix_acl_xattr_header dummy_acl; p = fs_path_alloc(); if (!p) @@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "reference"); + return -EIO; + } if (!sctx->cur_inode_new_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { @@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "xattr"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) @@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "extent"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 864ce334f696..4071fe2bd098 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; + root->fs_info->fs_frozen = 1; + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ @@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + root->fs_info->fs_frozen = 0; + return 0; +} + static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9cca0a721961..95d41919d034 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); + /* + * If fs has been frozen, we can not handle delayed iputs, otherwise + * it'll result in deadlock about SB_FREEZE_FS. + */ if (current != root->fs_info->transaction_kthread && - current != root->fs_info->cleaner_kthread) + current != root->fs_info->cleaner_kthread && + !root->fs_info->fs_frozen) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d31a0c4f56be..8a84ebd8e7cc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -3944,7 +3961,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, * i_mapping flags, so that the next fsync won't get * an outdated io error too. */ - btrfs_inode_check_errors(inode); + filemap_check_errors(inode->i_mapping); *ordered_io_error = true; break; } @@ -4181,7 +4198,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * without writing to the log tree and the fsync must report the * file data write error and not commit the current transaction. */ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); if (ret) ctx->io_err = ret; process: @@ -4469,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode) + struct inode *inode, + u64 *other_ino) { int ret; struct btrfs_path *search_path; @@ -4528,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path, parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { - ret = 1; + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + ret = 1; + *other_ino = di_key.objectid; + } else { + ret = -EAGAIN; + } goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -4722,16 +4749,72 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && BTRFS_I(inode)->generation == trans->transid) { + u64 other_ino = 0; + ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], - &min_key, inode); + &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { - err = 1; - btrfs_set_log_full_commit(root->fs_info, trans); - goto out_unlock; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { + struct btrfs_key inode_key; + struct inode *other_inode; + + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + btrfs_release_path(path); + inode_key.objectid = other_ino; + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + other_inode = btrfs_iget(root->fs_info->sb, + &inode_key, root, + NULL); + /* + * If the other inode that had a conflicting dir + * entry was deleted in the current transaction, + * we don't need to do more work nor fallback to + * a transaction commit. + */ + if (IS_ERR(other_inode) && + PTR_ERR(other_inode) == -ENOENT) { + goto next_key; + } else if (IS_ERR(other_inode)) { + err = PTR_ERR(other_inode); + goto out_unlock; + } + /* + * We are safe logging the other inode without + * acquiring its i_mutex as long as we log with + * the LOG_INODE_EXISTS mode. We're safe against + * concurrent renames of the other inode as well + * because during a rename we pin the log and + * update the log with the new name before we + * unpin it. + */ + err = btrfs_log_inode(trans, root, other_inode, + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(other_inode); + if (err) + goto out_unlock; + else + goto next_key; } } @@ -4799,7 +4882,7 @@ next_slot: ins_nr = 0; } btrfs_release_path(path); - +next_key: if (min_key.offset < (u64)-1) { min_key.offset++; } else if (min_key.type < max_key.type) { @@ -4993,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - if (IS_ROOT(parent)) + if (IS_ROOT(parent)) { + inode = d_inode(parent); + if (btrfs_must_commit_transaction(trans, inode)) + ret = 1; break; + } parent = dget_parent(parent); dput(old_parent); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75d080d..ab858e31ccbc 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx { int log_transid; int io_err; bool log_new_dentries; + struct inode *inode; struct list_head list; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; ctx->log_new_dentries = false; + ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 51f125508771..035efce603a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work) struct btrfs_device *device; device = container_of(work, struct btrfs_device, rcu_work); - - if (device->bdev) - blkdev_put(device->bdev, device->mode); - rcu_string_free(device->name); kfree(device); } @@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_bdev(struct btrfs_device *device) +{ + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + if (device->bdev) + blkdev_put(device->bdev, device->mode); +} + static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; @@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->missing) fs_devices->missing_devices--; - if (device->bdev && device->writeable) { - sync_blockdev(device->bdev); - invalidate_bdev(device->bdev); - } + btrfs_close_bdev(device); new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); @@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } + btrfs_close_bdev(device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } + + btrfs_close_bdev(srcdev); + call_rcu(&srcdev->rcu, free_device); /* @@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, * the device_list_mutex lock. */ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + + btrfs_close_bdev(tgtdev); call_rcu(&tgtdev->rcu, free_device); } diff --git a/fs/buffer.c b/fs/buffer.c index 9c8eb9b6db6a..7dad8713fac8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1078,7 +1078,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -struct buffer_head * +static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { @@ -1109,7 +1109,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, free_more_memory(); } } -EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ce5f345d70f5..e7f16a77a22a 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) struct cachefiles_object *object; struct cachefiles_cache *cache; const struct cred *saved_cred; + struct inode *inode; + blkcnt_t i_blocks = 0; ASSERT(_object); @@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object) _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); + inode = d_backing_inode(object->dentry); + if (inode) + i_blocks = inode->i_blocks; + cachefiles_begin_secure(cache, &saved_cred); cachefiles_delete_object(cache, object); cachefiles_end_secure(cache, saved_cred); @@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive(cache, object, i_blocks); dput(object->dentry); object->dentry = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2fcde1a34b7c..cd1effee8a49 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); * namei.c */ extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object); + struct cachefiles_object *object, + blkcnt_t i_blocks); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3f7c2cd41f8f..c6ee4b5fb7e6 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -261,10 +261,9 @@ requeue: * Mark an object as being inactive. */ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object) + struct cachefiles_object *object, + blkcnt_t i_blocks) { - blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks; - write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -707,7 +706,8 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive( + cache, object, d_backing_inode(object->dentry)->i_blocks); release_dentry: dput(object->dentry); object->dentry = NULL; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 99115cae1652..16e6ded0b7f2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, { struct inode *inode = &ci->vfs_inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct ceph_mds_session *session = *psession; + struct ceph_mds_session *session = NULL; int mds; + dout("ceph_flush_snaps %p\n", inode); + if (psession) + session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c64a0b794d49..df4b3e6fa563 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag |= fpos_frag(new_pos)) { + } else if (fi->frag != fpos_frag(new_pos)) { return true; } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fa59a85226b2..f72d4ae303b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } else { path = NULL; pathlen = 0; + pathbase = 0; } spin_lock(&ci->i_ceph_lock); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 6bbec5e784cd..14ae4b8e1a3c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + return dget(sb->s_root); + full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) @@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; + goto out_free; } - if (volume_info->prepath) { - cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); - if (cifs_sb->prepath == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; - } + rc = cifs_setup_cifs_sb(volume_info, cifs_sb); + if (rc) { + root = ERR_PTR(rc); + goto out_free; } - cifs_setup_cifs_sb(volume_info, cifs_sb); - rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); - goto out_mountdata; + goto out_free; } mnt_data.vol = volume_info; @@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) - root = dget(sb->s_root); - else - root = cifs_get_root(volume_info, sb); - + root = cifs_get_root(volume_info, sb); if (IS_ERR(root)) goto out_super; @@ -752,9 +747,9 @@ out: cifs_cleanup_volume_info(volume_info); return root; -out_mountdata: +out_free: + kfree(cifs_sb->prepath); kfree(cifs_sb->mountdata); -out_cifs_sb: kfree(cifs_sb); out_nls: unload_nls(volume_info->local_nls); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9dcf974acc47..c9c00a862036 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,16 @@ cifs_uniqueid_to_ino_t(u64 fileid) } +static inline void cifs_set_time(struct dentry *dentry, unsigned long time) +{ + dentry->d_fsdata = (void *) time; +} + +static inline unsigned long cifs_get_time(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1243bd326591..4ead72a001f9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int to_read); -extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); @@ -392,8 +392,7 @@ extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *return_buf_type); extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op); + unsigned int *nbytes, const char *buf); extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, const int nvec); extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d47197ea4ab6..f82d2823622f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1228,7 +1228,6 @@ OldOpenRetry: inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); @@ -1768,8 +1767,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op) + unsigned int *nbytes, const char *buf) { int rc = -EACCES; WRITE_REQ *pSMB = NULL; @@ -1838,12 +1836,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); if (buf) memcpy(pSMB->Data, buf, bytes_sent); - else if (ubuf) { - if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) { - cifs_buf_release(pSMB); - return -EFAULT; - } - } else if (count != 0) { + else if (count != 0) { /* No buffer */ cifs_buf_release(pSMB); return -EINVAL; @@ -1867,7 +1860,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, } rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { cifs_dbg(FYI, "Send error in write = %d\n", rc); @@ -3334,7 +3327,7 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX /*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(posix_acl_xattr_entry *ace, +static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, struct cifs_posix_ace *cifs_ace) { /* u8 cifs fields do not need le conversion */ @@ -3358,7 +3351,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, __u16 count; struct cifs_posix_ace *pACE; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt; + struct posix_acl_xattr_header *local_acl = (void *)trgt; if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; @@ -3396,9 +3389,11 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } else if (size > buflen) { return -ERANGE; } else /* buffer big enough */ { + struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); + local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (i = 0; i < count ; i++) { - cifs_convert_ace(&local_acl->a_entries[i], pACE); + cifs_convert_ace(&ace[i], pACE); pACE++; } } @@ -3406,7 +3401,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const posix_acl_xattr_entry *local_ace) + const struct posix_acl_xattr_entry *local_ace) { __u16 rc = 0; /* 0 = ACL converted ok */ @@ -3431,7 +3426,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, { __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL; + struct posix_acl_xattr_header *local_acl = (void *)pACL; int count; int i; @@ -3459,7 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, } for (i = 0; i < count; i++) { rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], - &local_acl->a_entries[i]); + (struct posix_acl_xattr_entry *)(local_acl + 1)); if (rc != 0) { /* ACE not converted */ break; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae03283bd61..2e4f4bad8b1e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 1; } +static int +match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { + if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) + return 0; + /* The prepath should be null terminated strings */ + if (strcmp(new->prepath, old->prepath)) + return 0; + + return 1; + } + return 0; +} + int cifs_match_super(struct super_block *sb, void *data) { @@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data) if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || - !match_tcon(tcon, volume_info->UNC)) { + !match_tcon(tcon, volume_info->UNC) || + !match_prepath(sb, mnt_data)) { rc = 0; goto out; } @@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } } -void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb) { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); @@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); + + if (pvolume_info->prepath) { + cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) + return -ENOMEM; + } + + return 0; } static void diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4716c54dbfc6..789ff1df2d8d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -40,7 +40,7 @@ renew_parental_timestamps(struct dentry *direntry) /* BB check if there is a way to get the kernel to do this or if we really need this */ do { - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); direntry = direntry->d_parent; } while (!IS_ROOT(direntry)); } @@ -802,7 +802,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ @@ -862,7 +862,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + if (time_after(jiffies, cifs_get_time(direntry) + HZ) || !lookupCacheEnabled) return 0; return 1; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 579e41b350a2..42b99af74e0a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2478,7 +2478,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, size_t cur_len; unsigned long nr_pages, num_pages, i; struct cifs_writedata *wdata; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; @@ -2489,7 +2489,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, pid = current->tgid; server = tlink_tcon(open_file->tlink)->ses->server; - memcpy(&saved_from, from, sizeof(struct iov_iter)); do { unsigned int wsize, credits; @@ -2551,8 +2550,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, kref_put(&wdata->refcount, cifs_uncached_writedata_release); if (rc == -EAGAIN) { - memcpy(from, &saved_from, - sizeof(struct iov_iter)); + *from = saved_from; iov_iter_advance(from, offset - saved_offset); continue; } @@ -2576,7 +2574,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) struct cifs_sb_info *cifs_sb; struct cifs_writedata *wdata, *tmp; struct list_head wdata_list; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; int rc; /* @@ -2597,8 +2595,6 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) if (!tcon->ses->server->ops->async_writev) return -ENOSYS; - memcpy(&saved_from, from, sizeof(struct iov_iter)); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, open_file, cifs_sb, &wdata_list); @@ -2631,13 +2627,11 @@ restart_loop: /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from; + struct iov_iter tmp_from = saved_from; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); - memcpy(&tmp_from, &saved_from, - sizeof(struct iov_iter)); iov_iter_advance(&tmp_from, wdata->offset - iocb->ki_pos); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 13cf507d1423..7ab5be7944aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1951,7 +1951,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", full_path, inode, inode->i_count.counter, - dentry, dentry->d_time, jiffies); + dentry, cifs_get_time(dentry), jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 062c2375549a..d031af8d3d4d 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -399,7 +399,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf); CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/compat.c b/fs/compat.c index be6e48b0a46c..bd064a2c3550 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -54,20 +54,6 @@ #include <asm/ioctls.h> #include "internal.h" -int compat_log = 1; - -int compat_printk(const char *fmt, ...) -{ - va_list ap; - int ret; - if (!compat_log) - return 0; - va_start(ap, fmt); - ret = vprintk(fmt, ap); - va_end(ap); - return ret; -} - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -562,7 +548,7 @@ ssize_t compat_rw_copy_check_uvector(int type, goto out; ret = -EINVAL; - if (nr_segs > UIO_MAXIOV || nr_segs < 0) + if (nr_segs > UIO_MAXIOV) goto out; if (nr_segs > fast_segs) { ret = -ENOMEM; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c30cf49b69d2..2c6312db8516 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, if (bin_attr->cb_max_size && *ppos + count > bin_attr->cb_max_size) { len = -EFBIG; + goto out; } tbuf = vmalloc(*ppos + count); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0f9961eede1e..ed115acb5dee 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/fscrypto.h> +#include <linux/mount.h> static int inode_has_encryption_context(struct inode *inode) { @@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d116453b0276..79a5941c2474 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -585,7 +585,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) */ void *devpts_get_priv(struct dentry *dentry) { - WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); + if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) + return NULL; return dentry->d_fsdata; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index eea64912c9c0..466f7d60edc2 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -607,20 +607,54 @@ static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; -static int table_open(struct inode *inode, struct file *file) +static int table_open1(struct inode *inode, struct file *file) { struct seq_file *seq; - int ret = -1; + int ret; - if (file->f_op == &format1_fops) - ret = seq_open(file, &format1_seq_ops); - else if (file->f_op == &format2_fops) - ret = seq_open(file, &format2_seq_ops); - else if (file->f_op == &format3_fops) - ret = seq_open(file, &format3_seq_ops); - else if (file->f_op == &format4_fops) - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format1_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open2(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format2_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open3(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format3_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open4(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + ret = seq_open(file, &format4_seq_ops); if (ret) return ret; @@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file) static const struct file_operations format1_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open1, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -639,7 +673,7 @@ static const struct file_operations format1_fops = { static const struct file_operations format2_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open2, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -647,7 +681,7 @@ static const struct file_operations format2_fops = { static const struct file_operations format3_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open3, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -655,7 +689,7 @@ static const struct file_operations format3_fops = { static const struct file_operations format4_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open4, .read = seq_read, .llseek = seq_lseek, .release = seq_release diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fdf9bee67a12..b27491e885c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..1bb7df5e4536 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1c593aa0218e..3ec8708989ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb, grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "superblock", i); + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "superblock", i); + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_table = ext4_inode_table(sb, gdp); + if (inode_table == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "superblock", i); + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 39e9cfb1b371..2eb935ca5d9e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, size_t min_offs, free; int total_ino; void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); + /* + * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty + */ + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + goto out; header = IHDR(inode, raw_inode); entry = IFIRST(header); @@ -1382,7 +1386,7 @@ retry: goto cleanup; free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { + if (free >= isize_diff) { entry = IFIRST(header); ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + @@ -1390,8 +1394,7 @@ retry: (void *)header, total_ino, inode->i_sb->s_blocksize); EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; + goto out; } /* @@ -1414,7 +1417,7 @@ retry: end = bh->b_data + bh->b_size; min_offs = end - base; free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < new_extra_isize) { + if (free < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1428,7 +1431,7 @@ retry: free = inode->i_sb->s_blocksize; } - while (new_extra_isize > 0) { + while (isize_diff > 0) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1459,7 +1462,7 @@ retry: EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { + if (total_size < isize_diff) { small_entry = last; } else { entry = last; @@ -1514,22 +1517,22 @@ retry: error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto cleanup; + total_ino -= entry_size; entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; + if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) + shift_bytes = isize_diff; else - shift_bytes = entry_size + size; + shift_bytes = entry_size + EXT4_XATTR_SIZE(size); /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); + ext4_xattr_shift_entries(entry, -shift_bytes, + (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize + shift_bytes, + (void *)header, total_ino, inode->i_sb->s_blocksize); - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; + isize_diff -= shift_bytes; + EXT4_I(inode)->i_extra_isize += shift_bytes; + header = IHDR(inode, raw_inode); i.name = b_entry_name; i.value = buffer; @@ -1551,6 +1554,8 @@ retry: kfree(bs); } brelse(bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1562,6 +1567,10 @@ cleanup: kfree(is); kfree(bs); brelse(bh); + /* + * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode + * size expansion failed. + */ up_write(&EXT4_I(inode)->xattr_sem); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69dd3e6566e0..a92e783fa057 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -24,6 +24,7 @@ #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 #define EXT4_XATTR_INDEX_ENCRYPTION 9 +#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d64d2a515cb2..ccb401eebc11 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); - f2fs_put_page(page, 1); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); + f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 675fa79d86f6..14f5fe2b841e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -538,7 +538,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -787,7 +787,7 @@ struct f2fs_sb_info { struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ @@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - percpu_down_read(&sbi->cp_rwsem); + down_read(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - percpu_up_read(&sbi->cp_rwsem); + up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - percpu_down_write(&sbi->cp_rwsem); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - percpu_up_write(&sbi->cp_rwsem); + up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 26ebda971d53..e4e17f167dfc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (unlikely(f2fs_readonly(src->i_sb))) return -EROFS; - if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) - return -EISDIR; + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) return -EOPNOTSUPP; inode_lock(src); - if (src != dst) - inode_lock(dst); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } ret = -EINVAL; if (pos_in + len > src->i_size || pos_in + len < pos_in) @@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, out_unlock: if (src != dst) inode_unlock(dst); +out: inode_unlock(src); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b2fa4b615925..67ed2192f926 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need; } @@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need_update; } @@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); @@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - percpu_down_write(&nm_i->nat_tree_lock); + if (!down_write_trylock(&nm_i->nat_tree_lock)) + return 0; while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; @@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->nid = nid; /* Check nat cache */ - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return; } @@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } /* @@ -1512,7 +1513,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; - int ret2 = 0, ret = 0; + int ret2, ret = 0; pagevec_init(&pvec, 0); @@ -1541,10 +1542,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) - ret2 = -ENOSPC; - if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) - ret2 = -EIO; + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; return ret; @@ -1788,7 +1786,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1820,7 +1818,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } up_read(&curseg->journal_rwsem); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2209,7 +2207,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -2232,7 +2230,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2268,8 +2266,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - if (percpu_init_rwsem(&nm_i->nat_tree_lock)) - return -ENOMEM; + init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2326,7 +2323,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -2351,9 +2348,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); - percpu_free_rwsem(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b86d3f638ef..7f863a645ab1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); - - percpu_free_rwsem(&sbi->cp_rwsem); } static void f2fs_put_super(struct super_block *sb) @@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) { int i, err; - if (percpu_init_rwsem(&sbi->cp_rwsem)) - return -ENOMEM; - for (i = 0; i < NR_COUNT_TYPE; i++) { err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); if (err) @@ -1686,6 +1681,7 @@ try_onemore: sbi->write_io[i].bio = NULL; } + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 92b7363dafa9..4afdc3f36470 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -21,6 +21,17 @@ #include <linux/namei.h> #include "fat.h" +static inline unsigned long vfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void vfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + /* * If new entry was created in the parent, it could create the 8.3 * alias (the shortname of logname). So, the parent may have the @@ -33,7 +44,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != d_inode(dentry->d_parent)->i_version) + if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -759,7 +770,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); if (!inode) - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -823,7 +834,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -849,7 +860,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); diff --git a/fs/file.c b/fs/file.c index 6b1acdfe59da..69d6990e3021 100644 --- a/fs/file.c +++ b/fs/file.c @@ -23,12 +23,12 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -int sysctl_nr_open __read_mostly = 1024*1024; -int sysctl_nr_open_min = BITS_PER_LONG; +unsigned int sysctl_nr_open __read_mostly = 1024*1024; +unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) -int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; +unsigned int sysctl_nr_open_max = + __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -163,7 +163,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -208,7 +208,7 @@ static int expand_fdtable(struct files_struct *files, int nr) * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -243,12 +243,12 @@ repeat: return expanded; } -static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } -static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); @@ -268,10 +268,10 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } -static int count_open_files(struct fdtable *fdt) +static unsigned int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fds; - int i; + unsigned int size = fdt->max_fds; + unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { @@ -291,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, i; + unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files) * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); - int i, j = 0; + unsigned int i, j = 0; for (;;) { unsigned long set; @@ -477,11 +477,11 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; -static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned long maxfd = fdt->max_fds; - unsigned long maxbit = maxfd / BITS_PER_LONG; - unsigned long bitbit = start / BITS_PER_LONG; + unsigned int maxfd = fdt->max_fds; + unsigned int maxbit = maxfd / BITS_PER_LONG; + unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4d09d4441e3e..05713a5da083 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; + /* + * If we are expecting writeback progress we must submit plugged IO. + */ + if (blk_needs_flush_plug(current)) + blk_schedule_flush_plug(current); + if (!nr_pages) nr_pages = get_nr_dirty_pages(); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4b9201b7c80b..118bcb192a7b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } @@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { err = req->out.h.error; break; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index d3bcdd975700..b3be1b5a62e2 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -189,6 +189,11 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, hpfs_get_block); } +static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block); +} + const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, @@ -214,4 +219,5 @@ const struct file_operations hpfs_file_ops = const struct inode_operations hpfs_file_iops = { .setattr = hpfs_setattr, + .fiemap = hpfs_fiemap, }; diff --git a/fs/inode.c b/fs/inode.c index 7e3ef3af3db9..a3c7ba7f6b59 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1021,13 +1021,17 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1064,6 +1068,10 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; @@ -1091,12 +1099,16 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1131,6 +1143,10 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; } @@ -1266,10 +1282,16 @@ EXPORT_SYMBOL(ilookup5_nowait); struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - - if (inode) + struct inode *inode; +again: + inode = ilookup5_nowait(sb, hashval, test, data); + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup5); @@ -1286,13 +1308,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); - if (inode) + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup); @@ -1536,16 +1563,36 @@ sector_t bmap(struct inode *inode, sector_t block) EXPORT_SYMBOL(bmap); /* + * Update times in overlayed inode from underlying real inode + */ +static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, + bool rcu) +{ + if (!rcu) { + struct inode *realinode = d_real_inode(dentry); + + if (unlikely(inode != realinode) && + (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + } + } +} + +/* * With relative atime, only update atime if the previous atime is * earlier than either the ctime or mtime or if at least a day has * passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, - struct timespec now) +static int relatime_need_update(const struct path *path, struct inode *inode, + struct timespec now, bool rcu) { - if (!(mnt->mnt_flags & MNT_RELATIME)) + if (!(path->mnt->mnt_flags & MNT_RELATIME)) return 1; + + update_ovl_inode_times(path->dentry, inode, rcu); /* * Is mtime younger than atime? If yes, update atime: */ @@ -1612,7 +1659,8 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -bool atime_needs_update(const struct path *path, struct inode *inode) +bool __atime_needs_update(const struct path *path, struct inode *inode, + bool rcu) { struct vfsmount *mnt = path->mnt; struct timespec now; @@ -1638,7 +1686,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) now = current_fs_time(inode->i_sb); - if (!relatime_need_update(mnt, inode, now)) + if (!relatime_need_update(path, inode, now, rcu)) return false; if (timespec_equal(&inode->i_atime, &now)) @@ -1653,7 +1701,7 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec now; - if (!atime_needs_update(path, inode)) + if (!__atime_needs_update(path, inode, false)) return; if (!sb_start_write_trylock(inode->i_sb)) diff --git a/fs/internal.h b/fs/internal.h index ba0737649d4a..18cfde9066a0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -120,6 +120,15 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); +extern bool __atime_needs_update(const struct path *, struct inode *, bool); +static inline bool atime_needs_update_rcu(const struct path *path, + struct inode *inode) +{ + return __atime_needs_update(path, inode, true); +} + +extern bool atime_needs_update_rcu(const struct path *, struct inode *); + /* * fs-writeback.c */ @@ -156,7 +165,7 @@ extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ -extern struct dentry_operations ns_dentry_operations; +extern const struct dentry_operations ns_dentry_operations; /* * fs/ioctl.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 0f56deb24ce6..c415668c86d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, void __user *arg) { struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; @@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg) } size = offsetof(struct file_dedupe_range __user, info[count]); + if (size > PAGE_SIZE) { + ret = -ENOMEM; + goto out; + } same = memdup_user(argp, size); if (IS_ERR(same)) { diff --git a/fs/iomap.c b/fs/iomap.c index 48141b8eff5f..706270f21b35 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, * Now the data has been copied, commit the range we've copied. This * should not fail unless the filesystem has had a fatal error. */ - ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, - flags, &iomap); + if (ops->iomap_end) { + ret = ops->iomap_end(inode, pos, length, + written > 0 ? written : 0, + flags, &iomap); + } return written ? written : ret; } @@ -194,12 +197,9 @@ again: if (mapping_writably_mapped(inode->i_mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); - mark_page_accessed(page); status = iomap_write_end(inode, pos, bytes, copied, page); if (unlikely(status < 0)) @@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, break; } + if (iomap->flags & IOMAP_F_MERGED) + flags |= FIEMAP_EXTENT_MERGED; + return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, - iomap->length, flags | FIEMAP_EXTENT_MERGED); + iomap->length, flags); } @@ -470,13 +473,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, if (ret) return ret; - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - return ret; + if (fi->fi_flags & FIEMAP_FLAG_SYNC) { + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + } while (len > 0) { ret = iomap_apply(inode, start, len, 0, ops, &ctx, iomap_fiemap_actor); + /* inode with no (attribute) mapping will give ENOENT */ + if (ret == -ENOENT) + break; if (ret < 0) return ret; if (ret == 0) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e1574008adc9..2bcb86e6e6ca 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -840,21 +840,35 @@ repeat: mutex_lock(&kernfs_mutex); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct kernfs_node *parent; struct inode *inode; - struct dentry *dentry; + /* + * We want fsnotify_modify() on @kn but as the + * modifications aren't originating from userland don't + * have the matching @file available. Look up the inodes + * and generate the events manually. + */ inode = ilookup(info->sb, kn->ino); if (!inode) continue; - dentry = d_find_any_alias(inode); - if (dentry) { - fsnotify_parent(NULL, dentry, FS_MODIFY); - fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - NULL, 0); - dput(dentry); + parent = kernfs_get_parent(kn); + if (parent) { + struct inode *p_inode; + + p_inode = ilookup(info->sb, parent->ino); + if (p_inode) { + fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, + inode, FSNOTIFY_EVENT_INODE, kn->name, 0); + iput(p_inode); + } + + kernfs_put(parent); } + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + kn->name, 0); iput(inode); } diff --git a/fs/locks.c b/fs/locks.c index ee1b15f6fc13..b242d5b99589 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -139,6 +139,11 @@ #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +static inline bool is_remote_lock(struct file *filp) +{ + return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); +} + static bool lease_breaking(struct file_lock *fl) { return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); @@ -791,7 +796,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -1192,7 +1197,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(file_inode(filp), fl, conflock); + return posix_lock_inode(locks_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1232,7 +1237,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_mandatory_locked(struct file *file) { int ret; - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct file_lock_context *ctx; struct file_lock *fl; @@ -1572,7 +1577,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1580,7 +1585,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - time_out_leases(file_inode(filp), &dispose); + time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) continue; @@ -1613,7 +1618,8 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && + (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || @@ -1628,7 +1634,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = file_inode(filp); + struct inode *inode = dentry->d_inode; struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1742,7 +1748,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1782,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1830,7 +1836,7 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - if (filp->f_op->setlease) + if (filp->f_op->setlease && is_remote_lock(filp)) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); @@ -1979,7 +1985,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op->flock) + if (f.file->f_op->flock && is_remote_lock(f.file)) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); @@ -2005,7 +2011,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -2129,7 +2135,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -2191,7 +2197,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - inode = file_inode(filp); + inode = locks_inode(filp); /* * This might block, so we do it before checking the inode. @@ -2343,7 +2349,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); + inode = locks_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -2426,6 +2432,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; + struct inode *inode = locks_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2434,7 +2441,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2452,7 +2459,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); - trace_locks_remove_posix(file_inode(filp), &lock, error); + trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); @@ -2469,12 +2476,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return; - if (filp->f_op->flock) + if (filp->f_op->flock && is_remote_lock(filp)) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); @@ -2508,7 +2515,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); if (!ctx) return; @@ -2552,7 +2559,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } @@ -2580,7 +2587,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, fl_pid = fl->fl_pid; if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + inode = locks_inode(fl->fl_file); seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { @@ -2682,7 +2689,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/namei.c b/fs/namei.c index adb04146df09..4bbcae1ba58e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { + } else if (atime_needs_update_rcu(&last->link, inode)) { if (unlikely(unlazy_walk(nd, NULL, 0))) return ERR_PTR(-ECHILD); touch_atime(&last->link); diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..dcd9afe21e62 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2700,7 +2700,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME); + MS_STRICTATIME | MS_NOREMOTELOCK); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index f55a4e756047..217847679f0e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work) PAGE_SIZE - 1) & (loff_t)PAGE_MASK; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT); + (end - start) >> SECTOR_SHIFT, end); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 18e6fd0b9506..efc007f00742 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -141,6 +141,7 @@ struct pnfs_block_layout { struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ bool bl_scsi_layout; + u64 bl_lwb; }; static inline struct pnfs_block_layout * @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl, int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end); int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len); + sector_t len, u64 lwb); bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent *ret, bool rw); int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 992bcb19c11e..c85fbfd2d0d9 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len) + sector_t len, u64 lwb) { struct rb_root *root = &bl->bl_ext_rw; sector_t end = start + len; @@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } } out: + if (bl->bl_lwb < lwb) + bl->bl_lwb = lwb; spin_unlock(&bl->bl_ext_lock); __ext_put_deviceids(&tmp); @@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) } static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, - size_t buffer_size, size_t *count) + size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; int ret = 0; @@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; } + *lastbyte = bl->bl_lwb - 1; + bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; @@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) arg->layoutupdate_pages = &arg->layoutupdate_page; retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a7f2e6e33305..52a28311e2a4 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, err_socks: svc_rpcb_cleanup(serv, net); err_bind: + nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %p\n", ret, net); return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c92a75e066a6..f953ef6b2f2e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp, ((u32 *)&rclist->rcl_sessionid.data)[3], ref->rc_sequenceid, ref->rc_slotid); - spin_lock(&tbl->slot_tbl_lock); - status = (test_bit(ref->rc_slotid, tbl->used_slots) && - tbl->slots[ref->rc_slotid].seq_nr == - ref->rc_sequenceid); - spin_unlock(&tbl->slot_tbl_lock); + status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, + ref->rc_sequenceid, HZ >> 1) < 0; if (status) goto out; } @@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, goto out; tbl = &clp->cl_session->bc_slot_table; - slot = tbl->slots + args->csa_slotid; /* Set up res before grabbing the spinlock */ memcpy(&res->csr_sessionid, &args->csa_sessionid, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 003ebce4bbc4..1e106780a237 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready); * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, - unsigned int timeo, unsigned int retrans) + int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; @@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (to->to_initval == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; @@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; - if (!to->to_initval) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7d620970f2e1..ca699ddc11c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - written = generic_write_sync(iocb, result); + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; iocb->ki_pos += written; /* Return error values */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6206eaf2bdf..51b51369704c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) if (ffl) { INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); + ffl->last_report_time = ktime_get(); return &ffl->generic_hdr; } else return NULL; @@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, { static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; - if (ktime_equal(mirror->last_report_time, notime)) - mirror->last_report_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; - if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >= report_interval) { - mirror->last_report_time = now; + ffl->last_report_time = now; return true; } @@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; + bool fail_return = false; int idx; /* mirrors are sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (idx+1 == fls->mirror_array_cnt) + fail_return = true; + ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); if (ds) { *best_idx = idx; return ds; @@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs4_pnfs_ds *ds; int ds_idx; +retry: /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); @@ -890,12 +897,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static void @@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; +retry: if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); @@ -956,12 +961,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static unsigned int diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 1bcdb15d0c41..3ee0c9fcea76 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; - ktime_t last_report_time; u32 report_interval; }; @@ -101,6 +100,7 @@ struct nfs4_flexfile_layout { struct pnfs_ds_commit_info commit_info; struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ + ktime_t last_report_time; /* Layoutstat report times */ }; static inline struct nfs4_flexfile_layout * diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 0aa36be71fce..f7a3f6b05369 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -17,8 +17,8 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; -static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; +static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; +static unsigned int dataserver_retrans; void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { @@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) - goto out; + goto out_fail; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].rsize = max_payload; if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) mirror->mirror_ds->ds_versions[0].wsize = max_payload; - } else { - ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, - lseg->pls_range.length, NFS4ERR_NXIO, - OP_ILLEGAL, GFP_NOIO); - if (fail_return || !ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + goto out; } + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); +out_fail: + if (fail_return || !ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(ino, lseg); + ds = NULL; out: return ds; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7ce5e023c3c3..74935a19e4bf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -58,6 +58,9 @@ struct nfs_clone_mount { */ #define NFS_UNSPEC_PORT (-1) +#define NFS_UNSPEC_RETRANS (UINT_MAX) +#define NFS_UNSPEC_TIMEO (UINT_MAX) + /* * Maximum number of pages that readdir can use for creating * a vmapped array of pages. @@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); -void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 33da841a21bb..64b43b4ad9dd 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -318,10 +318,22 @@ static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *server = NFS_SERVER(data->args.inode); + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); + spin_unlock(&inode->i_lock); nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, &data->res.seq_res, task); + } static void @@ -338,12 +350,14 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&data->args.stateid, + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); @@ -357,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) } else spin_unlock(&inode->i_lock); break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; - default: - break; } dprintk("%s server returns %d\n", __func__, task->tk_status); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 324bfdc21250..9bf64eacba5b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -396,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *); extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); +extern void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed); + /* nfs4state.c */ struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8d7d08d4f95f..cd3b7cfdde16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server, goto error; } + if (server->nfs_client == clp) { + error = -ELOOP; + goto error; + } + /* * Query for the lease time on clientid setup or renewal * diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a036e93bdf96..a9dec32ba9ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -634,15 +634,11 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs40_setup_sequence); -static int nfs40_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (slot == NULL) - goto out; - tbl = slot->table; spin_lock(&tbl->slot_tbl_lock); if (!nfs41_wake_and_assign_slot(tbl, slot)) @@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task, spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -out: +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); return 1; } @@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) tbl = slot->table; session = tbl->session; + /* Bump the slot sequence number */ + if (slot->seq_done) + slot->seq_nr++; + slot->seq_done = 0; + spin_lock(&tbl->slot_tbl_lock); /* Be nice to the server: try to ensure that the last transmitted * value for highest_user_slotid <= target_highest_slotid @@ -686,9 +693,12 @@ out_unlock: res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); + if (waitqueue_active(&tbl->slot_waitq)) + wake_up_all(&tbl->slot_waitq); } -int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +static int nfs41_sequence_process(struct rpc_task *task, + struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; @@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ @@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) goto retry_nowait; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + slot->seq_done = 1; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(res); out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { + nfs41_sequence_free_slot(res); task->tk_status = 0; ret = 0; } @@ -789,8 +799,37 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (!nfs41_sequence_process(task, res)) + return 0; + if (res->sr_slot != NULL) + nfs41_sequence_free_slot(res); + return 1; + +} EXPORT_SYMBOL_GPL(nfs41_sequence_done); +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (res->sr_slot->table->session != NULL) + return nfs41_sequence_process(task, res); + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) { + if (res->sr_slot->table->session != NULL) + nfs41_sequence_free_slot(res); + else + nfs40_sequence_free_slot(res); + } +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { if (res->sr_slot == NULL) @@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server, args, res, task); } +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref) struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); + nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); @@ -1656,9 +1707,14 @@ err: static struct nfs4_state * nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *ret; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) - return _nfs4_opendata_reclaim_to_nfs4_state(data); - return _nfs4_opendata_to_nfs4_state(data); + ret =_nfs4_opendata_reclaim_to_nfs4_state(data); + else + ret = _nfs4_opendata_to_nfs4_state(data); + nfs4_sequence_free_slot(&data->o_res.seq_res); + return ret; } static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) @@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + if (!nfs4_sequence_process(task, &data->o_res.seq_res)) return; if (task->tk_status == 0) { @@ -4237,12 +4293,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str err = _nfs4_do_fsinfo(server, fhandle, fsinfo); trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); if (err == 0) { - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo->lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + nfs4_set_lease_period(server->nfs_client, + fsinfo->lease_time * HZ, + now); break; } err = nfs4_handle_exception(server, err, &exception); @@ -7517,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); @@ -7867,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(task, &lgp->res.seq_res); + nfs41_sequence_process(task, &lgp->res.seq_res); dprintk("<-- %s\n", __func__); } @@ -8083,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); + nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8109,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -8121,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; + nfs4_sequence_free_slot(&lrp->res.seq_res); rpc_restart_call_prepare(task); return; } @@ -8135,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); + nfs4_sequence_free_slot(&lrp->res.seq_res); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e1ba58c3d1ad..82e77198d17e 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp) cancel_delayed_work_sync(&clp->cl_renewd); } +/** + * nfs4_set_lease_period - Sets the lease period on a nfs_client + * + * @clp: pointer to nfs_client + * @lease: new value for lease period + * @lastrenewed: time at which lease was last renewed + */ +void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed) +{ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = lease; + clp->cl_last_renewal = lastrenewed; + spin_unlock(&clp->cl_lock); + + /* Cap maximum reconnect timeout at 1/2 lease period */ + rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); +} + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 332d06e64fa9..b62973045a3e 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_waitqueue_head(&tbl->slot_waitq); init_completion(&tbl->complete); } @@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) return ERR_PTR(-E2BIG); } +static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid, + u32 *seq_nr) + __must_hold(&tbl->slot_tbl_lock) +{ + struct nfs4_slot *slot; + + slot = nfs4_lookup_slot(tbl, slotid); + if (IS_ERR(slot)) + return PTR_ERR(slot); + *seq_nr = slot->seq_nr; + return 0; +} + +/* + * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use + * + * Given a slot table, slot id and sequence number, determine if the + * RPC call in question is still in flight. This function is mainly + * intended for use by the callback channel. + */ +static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr) +{ + u32 cur_seq; + bool ret = false; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 && + cur_seq == seq_nr && test_bit(slotid, tbl->used_slots)) + ret = true; + spin_unlock(&tbl->slot_tbl_lock); + return ret; +} + +/* + * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete + * + * Given a slot table, slot id and sequence number, wait until the + * corresponding RPC call completes. This function is mainly + * intended for use by the callback channel. + */ +int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout) +{ + if (wait_event_timeout(tbl->slot_waitq, + !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr), + timeout) == 0) + return -ETIMEDOUT; + return 0; +} + /* * nfs4_alloc_slot - efficiently look for a free slot * diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 5b51298d1d03..f703b755351b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -21,7 +21,8 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1; + unsigned int interrupted : 1, + seq_done : 1; }; /* Sessions */ @@ -36,6 +37,7 @@ struct nfs4_slot_table { unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ spinlock_t slot_tbl_lock; struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + wait_queue_head_t slot_waitq; /* Completion wait on slot */ u32 max_slots; /* # slots in table */ u32 max_slotid; /* Max allowed slotid value */ u32 highest_used_slotid; /* sent to server on each SEQ. @@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid); +extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout); extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 834b875900d6..cada00aa5096 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; struct nfs_fsinfo fsinfo; + unsigned long now; if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { nfs4_schedule_state_renewal(clp); return 0; } + now = jiffies; status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - spin_unlock(&clp->cl_lock); - + nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); nfs4_schedule_state_renewal(clp); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 70806cae0d36..2c93a85eda51 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); @@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; - bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; + } + if (pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); /* * Because of wraparound, we want to keep the barrier @@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } @@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); @@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + nfs4_client_recover_expired_lease(clp); first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); @@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (!pnfs_layout_is_valid(lo)) { - pnfs_clear_layoutreturn_info(lo); - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - } if (res->return_on_close) @@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) data->args.fh = NFS_FH(inode); data->args.inode = inode; - nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); status = ld->prepare_layoutstats(&data->args); if (status) goto out_free; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 18d446e1a82b..d39601381adf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { + data->timeo = NFS_UNSPEC_TIMEO; + data->retrans = NFS_UNSPEC_RETRANS; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option, + unsigned long l_bound, unsigned long u_bound) +{ + int ret; + + ret = nfs_get_option_ul(args, option); + if (ret != 0) + return ret; + if (*option < l_bound || *option > u_bound) + return -ERANGE; + return 0; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw, mnt->bsize = option; break; case Opt_timeo: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX)) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX)) goto out_invalid_value; mnt->retrans = option; break; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8410ca275db1..a204d7e109d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } +static __be32 +nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) +{ + struct nfs4_ol_stateid *stp = openlockstateid(s); + __be32 ret; + + mutex_lock(&stp->st_mutex); + + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + goto out; + + release_lock_stateid(stp); + ret = nfs_ok; + +out: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(s); + return ret; +} + __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) @@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - ret = check_stateid_generation(stateid, &s->sc_stateid, 1); - if (ret) - break; - stp = openlockstateid(s); - ret = nfserr_locks_held; - if (check_for_locks(stp->st_stid.sc_file, - lockowner(stp->st_stateowner))) - break; - WARN_ON(!unhash_lock_stateid(stp)); + atomic_inc(&s->sc_count); spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; + ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); @@ -5507,7 +5523,7 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, - struct nfs4_ol_stateid **lst, bool *new) + struct nfs4_ol_stateid **plst, bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; @@ -5515,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *lst; unsigned int strhashval; + bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5531,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } - *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); - if (*lst == NULL) { +retry: + lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (lst == NULL) { status = nfserr_jukebox; goto out; } + + mutex_lock(&lst->st_mutex); + + /* See if it's still hashed to avoid race with FREE_STATEID */ + spin_lock(&cl->cl_lock); + hashed = !list_empty(&lst->st_perfile); + spin_unlock(&cl->cl_lock); + + if (!hashed) { + mutex_unlock(&lst->st_mutex); + nfs4_put_stid(&lst->st_stid); + goto retry; + } status = nfs_ok; + *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; @@ -5603,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); - if (status == nfs_ok) - mutex_lock(&lock_stp->st_mutex); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ba944123167b..ff476e654b8f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dchild)) return nfserrno(host_err); err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) { - dput(dchild); + /* + * We unconditionally drop our ref to dchild as fh_compose will have + * already grabbed its own ref for it. + */ + dput(dchild); + if (err) return err; - } return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, rdev, resfhp); } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. */ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) } /* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + +/* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. * Note that another thread calling fsnotify_clear_marks_by_group() may still @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ @@ -126,21 +132,6 @@ queue: } /* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - -/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f548629dfaac..bf72a2c58b75 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1850,7 +1850,7 @@ again: * pages being swapped out between us bringing them into memory * and doing the actual copying. */ - if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7dabbc31060e..f165f867f332 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5922,7 +5922,6 @@ bail: } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, @@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, } } - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); i--; } @@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); @@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 94b18369b1cc..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,9 +44,6 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * - * New in version 12 - * - Negotiate hb timeout when storage is down. - * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -78,7 +75,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 12ULL +#define O2NET_PROTOCOL_VERSION 11ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index cdeafb4e7ed6..0bb128659d4b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; - u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; /* if it failed, move it back to granted queue. * if master returns DLM_NORMAL and then down before sending ast, * it may have already been moved to granted queue, reset to @@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); - } else if ((res->state & DLM_LOCK_RES_RECOVERING) || - (old_owner != res->owner)) { - mlog(0, "res %.*s is in recovering or has been recovered.\n", - res->lockname.len, res->lockname.name); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1ab3657242e8..8a7eb81dcdfc 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ea47120a85ff..6ad3533940ba 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1199,14 +1199,24 @@ retry: inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); - if (ret == 1) + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; goto retry; + } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); - ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } } if (status < 0) { if (status != -ENOSPC) diff --git a/fs/open.c b/fs/open.c index 4fd6e256f4f4..8aeb08bb278b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,6 +68,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; + struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -90,7 +91,17 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - error = get_write_access(inode); + /* + * If this is an overlayfs then do as if opening the file so we get + * write access on the upper inode, not on the overlay inode. For + * non-overlay filesystems d_real() is an identity function. + */ + upperdentry = d_real(path->dentry, NULL, O_WRONLY); + error = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto mnt_drop_write_and_out; + + error = get_write_access(upperdentry->d_inode); if (error) goto mnt_drop_write_and_out; @@ -109,7 +120,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(inode); + put_write_access(upperdentry->d_inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: @@ -726,7 +737,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(inode, f->f_flags); + error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 324f0af40d7b..284373a57a08 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -177,8 +177,8 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx) } gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %s (pos=%llu)\n", - dentry->d_name.name, llu(pos)); + "orangefs_readdir called on %pd (pos=%llu)\n", + dentry, llu(pos)); memset(&readdir_response, 0, sizeof(readdir_response)); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 526040e09f78..f3c5b48e0432 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -585,8 +585,8 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) static int orangefs_file_release(struct inode *inode, struct file *file) { gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_file_release: called on %s\n", - file->f_path.dentry->d_name.name); + "orangefs_file_release: called on %pD\n", + file); orangefs_flush_inode(inode); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index cff00ebac03a..c83846fb9b14 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -129,8 +129,8 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_direct_IO: %s\n", - iocb->ki_filp->f_path.dentry->d_name.name); + "orangefs_direct_IO: %pD\n", + iocb->ki_filp); return -EINVAL; } @@ -216,8 +216,8 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: called on %s\n", - dentry->d_name.name); + "orangefs_setattr: called on %pd\n", + dentry); ret = setattr_prepare(dentry, iattr); if (ret) @@ -259,8 +259,8 @@ int orangefs_getattr(struct vfsmount *mnt, struct orangefs_inode_s *orangefs_inode = NULL; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_getattr: called on %s\n", - dentry->d_name.name); + "orangefs_getattr: called on %pd\n", + dentry); ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 62c525936ee8..a54390e45553 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -24,9 +24,9 @@ static int orangefs_create(struct inode *dir, struct inode *inode; int ret; - gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n", + gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", __func__, - dentry->d_name.name); + dentry); new_op = op_alloc(ORANGEFS_VFS_OP_CREATE); if (!new_op) @@ -43,9 +43,9 @@ static int orangefs_create(struct inode *dir, ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", + "%s: %pd: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", __func__, - dentry->d_name.name, + dentry, &new_op->downcall.resp.create.refn.khandle, new_op->downcall.resp.create.refn.fs_id, new_op, @@ -57,18 +57,18 @@ static int orangefs_create(struct inode *dir, inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &new_op->downcall.resp.create.refn); if (IS_ERR(inode)) { - gossip_err("%s: Failed to allocate inode for file :%s:\n", + gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, - dentry->d_name.name); + dentry); ret = PTR_ERR(inode); goto out; } gossip_debug(GOSSIP_NAME_DEBUG, - "%s: Assigned inode :%pU: for file :%s:\n", + "%s: Assigned inode :%pU: for file :%pd:\n", __func__, get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -76,9 +76,9 @@ static int orangefs_create(struct inode *dir, ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: dentry instantiated for %s\n", + "%s: dentry instantiated for %pd\n", __func__, - dentry->d_name.name); + dentry); SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); @@ -87,9 +87,9 @@ static int orangefs_create(struct inode *dir, out: op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: returning %d\n", + "%s: %pd: returning %d\n", __func__, - dentry->d_name.name, + dentry, ret); return ret; } @@ -115,8 +115,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, * -EEXIST on O_EXCL opens, which is broken if we skip this lookup * in the create path) */ - gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n", - __func__, dentry->d_name.name); + gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %pd\n", + __func__, dentry); if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1)) return ERR_PTR(-ENAMETOOLONG); @@ -169,9 +169,9 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, gossip_debug(GOSSIP_NAME_DEBUG, "orangefs_lookup: Adding *negative* dentry " - "%p for %s\n", + "%p for %pd\n", dentry, - dentry->d_name.name); + dentry); d_add(dentry, NULL); res = NULL; @@ -224,10 +224,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) int ret; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: called on %s\n" + "%s: called on %pd\n" " (inode %pU): Parent is %pU | fs_id %d\n", __func__, - dentry->d_name.name, + dentry, get_khandle_from_ino(inode), &parent->refn.khandle, parent->refn.fs_id); @@ -326,9 +326,9 @@ static int orangefs_symlink(struct inode *dir, ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Symlink) %pU -> %s\n", + "Inode (Symlink) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); @@ -390,9 +390,9 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Directory) %pU -> %s\n", + "Inode (Directory) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); /* * NOTE: we have no good way to keep nlink consistent for directories diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1714a737d556..4f971551b56f 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -350,8 +350,8 @@ static ssize_t orangefs_debug_write(struct file *file, struct client_debug_mask c_mask = { NULL, 0, 0 }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, - "orangefs_debug_write: %s\n", - file->f_path.dentry->d_name.name); + "orangefs_debug_write: %pD\n", + file); /* * Thwart users who try to jamb a ridiculous number diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 54e5d6681786..43fdc2765aea 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) } for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + if (ovl_is_private_xattr(name)) + continue; retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 12bcd07b9e32..1560fdc09a5f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,6 +12,8 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (!hardlink && !IS_POSIXACL(udir)) + stat->mode &= ~current_umask(); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); @@ -335,6 +340,32 @@ out_free: return ret; } +static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, + const struct posix_acl *acl) +{ + void *buffer; + size_t size; + int err; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) + return 0; + + size = posix_acl_to_xattr(NULL, acl, NULL, 0); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + err = size; + if (err < 0) + goto out_free; + + err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); +out_free: + kfree(buffer); + return err; +} + static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct kstat *stat, const char *link, struct dentry *hardlink) @@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *upper; struct dentry *newdentry; int err; + struct posix_acl *acl, *default_acl; if (WARN_ON(!workdir)) return -EROFS; + if (!hardlink) { + err = posix_acl_create(dentry->d_parent->d_inode, + &stat->mode, &default_acl, &acl); + if (err) + return err; + } + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } + if (!hardlink) { + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, + acl); + if (err) + goto out_cleanup; + + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, + default_acl); + if (err) + goto out_cleanup; + } if (!hardlink && S_ISDIR(stat->mode)) { err = ovl_set_opaque(newdentry); @@ -410,6 +460,10 @@ out_dput: out_unlock: unlock_rename(workdir, upperdir); out: + if (!hardlink) { + posix_acl_release(acl); + posix_acl_release(default_acl); + } return err; out_cleanup: @@ -950,9 +1004,9 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_dir_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 990388dba9b8..251e5253f2c1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> #include "overlayfs.h" static int ovl_copy_up_truncate(struct dentry *dentry) @@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) return err; } -static bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(const char *name) { -#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "." - return strncmp(name, OVL_XATTR_PRE_NAME, - sizeof(OVL_XATTR_PRE_NAME) - 1) == 0; + return strncmp(name, OVL_XATTR_PREFIX, + sizeof(OVL_XATTR_PREFIX) - 1) == 0; } -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { int err; - struct dentry *upperdentry; + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; + if (!value && !OVL_TYPE_UPPER(type)) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + } + err = ovl_copy_up(dentry); if (err) goto out_drop_write; - upperdentry = ovl_dentry_upper(dentry); + if (!OVL_TYPE_UPPER(type)) + ovl_path_upper(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_setxattr(upperdentry, name, value, size, flags); + if (value) + err = vfs_setxattr(realpath.dentry, name, value, size, flags); + else { + WARN_ON(flags != XATTR_REPLACE); + err = vfs_removexattr(realpath.dentry, name); + } revert_creds(old_cred); out_drop_write: @@ -225,16 +238,13 @@ out: return err; } -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; const struct cred *old_cred; - if (ovl_is_private_xattr(name)) - return -ENODATA; - old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(realdentry, name, value, size); revert_creds(old_cred); @@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; - int off; + size_t len; + char *s; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); @@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; /* filter out private xattrs */ - for (off = 0; off < res;) { - char *s = list + off; - size_t slen = strlen(s) + 1; + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; - BUG_ON(off + slen > res); + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + len -= slen; if (ovl_is_private_xattr(s)) { res -= slen; - memmove(s, s + slen, res - off); + memmove(s, s + slen, len); } else { - off += slen; + s += slen; } } return res; } -int ovl_removexattr(struct dentry *dentry, const char *name) -{ - int err; - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); - const struct cred *old_cred; - - err = ovl_want_write(dentry); - if (err) - goto out; - - err = -ENODATA; - if (ovl_is_private_xattr(name)) - goto out_drop_write; - - if (!OVL_TYPE_UPPER(type)) { - err = vfs_getxattr(realpath.dentry, name, NULL, 0); - if (err < 0) - goto out_drop_write; - - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; - - ovl_path_upper(dentry, &realpath); - } - - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_removexattr(realpath.dentry, name); - revert_creds(old_cred); -out_drop_write: - ovl_drop_write(dentry); -out: - return err; -} - struct posix_acl *ovl_get_acl(struct inode *inode, int type) { struct inode *realinode = ovl_inode_real(inode, NULL); const struct cred *old_cred; struct posix_acl *acl; - if (!IS_POSIXACL(realinode)) + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; if (!realinode->i_op->get_acl) return NULL; old_cred = ovl_override_creds(inode->i_sb); - acl = realinode->i_op->get_acl(realinode, type); + acl = get_acl(realinode, type); revert_creds(old_cred); return acl; @@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; @@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = { .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .update_time = ovl_update_time, }; @@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif mode &= S_IFMT; switch (mode) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e4f5c9536bfe..5813ccff8cd9 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -24,8 +24,8 @@ enum ovl_path_type { (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) -#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay" -#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque" +#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." +#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_ISUPPER_MASK 1UL @@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); int ovl_check_d_type_supported(struct path *realpath); +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags); -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size); +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); -int ovl_removexattr(struct dentry *dentry, const char *name); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); +bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index cf37fc76fc9f..f241b4ee3d8a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath, err = rdd->err; } while (!err && rdd->count); - if (!err && rdd->first_maybe_whiteout) + if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath->dentry, rdd); fput(realfile); @@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath) return rdd.d_type_supported; } + +static void ovl_workdir_cleanup_recurse(struct path *path, int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = RB_ROOT, + .is_lowest = false, + }; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + dentry = lookup_one_len(p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + ovl_workdir_cleanup(dir, path->mnt, dentry, level); + dput(dentry); + } + inode_unlock(dir); +out: + ovl_cache_free(&list); +} + +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + ovl_cleanup(dir, dentry); + return; + } + + err = ovl_do_rmdir(dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + ovl_workdir_cleanup_recurse(&path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + ovl_cleanup(dir, dentry); + } +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4036132842b5..3d0b9dee2b76 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -814,6 +814,10 @@ retry: struct kstat stat = { .mode = S_IFDIR | 0, }; + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat.mode, + }; if (work->d_inode) { err = -EEXIST; @@ -821,7 +825,7 @@ retry: goto out_dput; retried = true; - ovl_cleanup(dir, work); + ovl_workdir_cleanup(dir, mnt, work, 0); dput(work); goto retry; } @@ -829,6 +833,21 @@ retry: err = ovl_create_real(dir, work, &stat, NULL, NULL, true); if (err) goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; } out_unlock: inode_unlock(dir); @@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int __maybe_unused +ovl_posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, handler->name, buffer, size); +} + +static int __maybe_unused +ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { struct dentry *workdir = ovl_workdir(dentry); struct inode *realinode = ovl_inode_real(inode, NULL); @@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, posix_acl_release(acl); - return ovl_setxattr(dentry, inode, handler->name, value, size, flags); + err = ovl_xattr_set(dentry, handler->name, value, size, flags); + if (!err) + ovl_copyattr(ovl_inode_real(inode, NULL), inode); + + return err; out_acl_release: posix_acl_release(acl); return err; } -static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ovl_setxattr(dentry, inode, name, value, size, flags); + return -EPERM; } static int ovl_own_xattr_set(const struct xattr_handler *handler, @@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler, return -EPERM; } -static const struct xattr_handler ovl_posix_acl_access_xattr_handler = { +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, name, value, size, flags); +} + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; -static const struct xattr_handler ovl_posix_acl_default_xattr_handler = { +static const struct xattr_handler __maybe_unused +ovl_posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; static const struct xattr_handler ovl_own_xattr_handler = { .prefix = OVL_XATTR_PREFIX, + .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_other_xattr_handler = { .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, .set = ovl_other_xattr_set, }; static const struct xattr_handler *ovl_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL &ovl_posix_acl_access_xattr_handler, &ovl_posix_acl_default_xattr_handler, +#endif &ovl_own_xattr_handler, &ovl_other_xattr_handler, NULL }; -static const struct xattr_handler *ovl_xattr_noacl_handlers[] = { - &ovl_own_xattr_handler, - &ovl_other_xattr_handler, - NULL, -}; - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { NULL, NULL }; @@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directries, limit is %d\n", + pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_free_lowertmp; } else if (!ufs->config.upperdir && stacklen == 1) { @@ -1269,13 +1317,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; - if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) - sb->s_xattr = ovl_xattr_handlers; - else - sb->s_xattr = ovl_xattr_noacl_handlers; + sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 4b32928f5426..4ebe6b2e5217 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) { + if (memcg_kmem_enabled()) memcg_kmem_uncharge(page, 0); - __ClearPageKmemcg(page); - } __SetPageLocked(page); return 0; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index bfc3ec388322..da3f760a7e88 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -598,13 +598,14 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (IS_ERR(p)) return PTR_ERR(p); + ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) - goto no_mem; + goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto err_release_clone; if (ret == 0) posix_acl_release(clone); @@ -618,11 +619,11 @@ posix_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +err_release_clone: posix_acl_release(clone); -no_mem: +err_release: posix_acl_release(p); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); @@ -664,15 +665,15 @@ static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, void *value, size_t size) { - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; kuid_t uid; kgid_t gid; if (!value) return; - if (size < sizeof(posix_acl_xattr_header)) + if (size < sizeof(struct posix_acl_xattr_header)) return; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return; @@ -722,15 +723,15 @@ struct posix_acl * posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size) { - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + const struct posix_acl_xattr_header *header = value; + const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; int count; struct posix_acl *acl; struct posix_acl_entry *acl_e; if (!value) return NULL; - if (size < sizeof(posix_acl_xattr_header)) + if (size < sizeof(struct posix_acl_xattr_header)) return ERR_PTR(-EINVAL); if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return ERR_PTR(-EOPNOTSUPP); @@ -791,8 +792,8 @@ int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry; + struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); @@ -801,7 +802,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, if (real_size > size) return -ERANGE; - ext_entry = ext_acl->a_entries; + ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 207f36039f63..09f0f173821a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -400,23 +400,6 @@ static const struct file_operations proc_pid_cmdline_ops = { .llseek = generic_file_llseek, }; -static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) -{ - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (mm && !IS_ERR(mm)) { - unsigned int nwords = 0; - do { - nwords += 2; - } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); - mmput(mm); - return 0; - } else - return PTR_ERR(mm); -} - - #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. @@ -1014,6 +997,30 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static int auxv_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); +} + +static ssize_t auxv_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned int nwords = 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, + nwords * sizeof(mm->saved_auxv[0])); +} + +static const struct file_operations proc_auxv_operations = { + .open = auxv_open, + .read = auxv_read, + .llseek = generic_file_llseek, + .release = mem_release, +}; + static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1556,18 +1563,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); @@ -2827,7 +2829,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - ONE("auxv", S_IRUSR, proc_pid_auxv), + REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), @@ -3215,7 +3217,7 @@ static const struct pid_entry tid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - ONE("auxv", S_IRUSR, proc_pid_auxv), + REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 01df23cc81f6..d21dafef3102 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -31,7 +31,7 @@ static int seq_show(struct seq_file *m, void *v) put_task_struct(task); if (files) { - int fd = proc_fd(m->private); + unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); file = fcheck_files(files, fd); @@ -86,7 +86,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) struct task_struct *task; const struct cred *cred; struct inode *inode; - int fd; + unsigned int fd; if (flags & LOOKUP_RCU) return -ECHILD; @@ -158,7 +158,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) } if (files) { - int fd = proc_fd(d_inode(dentry)); + unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); @@ -253,7 +253,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, continue; rcu_read_unlock(); - len = snprintf(name, sizeof(name), "%d", fd); + len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, (void *)(unsigned long)fd)) diff --git a/fs/proc/fd.h b/fs/proc/fd.h index 7c047f256ae2..46dafadd0083 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -11,7 +11,7 @@ extern const struct inode_operations proc_fdinfo_inode_operations; extern int proc_fd_permission(struct inode *inode, int mask); -static inline int proc_fd(struct inode *inode) +static inline unsigned int proc_fd(struct inode *inode) { return PROC_I(inode)->fd; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7931c558c192..5378441ec1b7 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -60,7 +60,7 @@ union proc_op { struct proc_inode { struct pid *pid; - int fd; + unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a939f5ed7f89..5c89a07e3d7f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens @@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 09e18fdf61e5..b9a8c813e5e6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) cached = 0; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..f6fa99eca515 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 183a212694bf..12af0490322f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -27,9 +27,17 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/ramfs.h> +#include <linux/sched.h> #include "internal.h" +static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, + .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { diff --git a/fs/seq_file.c b/fs/seq_file.c index 19f532e7d35e..6dc4296eed62 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) + if (!m->count) { + m->from = 0; m->index++; + } if (!size) goto Done; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f35523d4fa3a..b803213d1307 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ - if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); + if (pos) { + if (len <= pos) + return 0; + len -= pos; + memmove(buf, buf + pos, len); + } return min(count, len); } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index b45345d701e7..51157da3f76e 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) p = c->gap_lebs; do { - ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e237811f09ce..11a004114eba 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, inode->i_ino, dentry, size); - return __ubifs_getxattr(inode, name, buffer, size); + name = xattr_full_name(handler, name); + return __ubifs_getxattr(inode, name, buffer, size); } static int ubifs_xattr_set(const struct xattr_handler *handler, @@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); + name = xattr_full_name(handler, name); + if (value) return __ubifs_setxattr(inode, name, value, size, flags); else diff --git a/fs/utimes.c b/fs/utimes.c index c84b26e46bc5..22307cdf7014 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -87,21 +87,7 @@ static int utimes_common(struct path *path, struct timespec *times) */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* - * If times is NULL (or both times are UTIME_NOW), - * then we need to check permissions, because - * setattr_prepare() won't do it. - */ - error = -EPERM; - if (IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - - error = -EACCES; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); @@ -113,7 +99,6 @@ retry_deleg: goto retry_deleg; } -mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 776ae2f325d1..05b5243d89f6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_owner_info oinfo; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small( error0); args->wasfromfl = 1; trace_xfs_alloc_small_freelist(args); + + /* + * If we're feeding an AGFL block to something that + * doesn't live in the free space, we need to clear + * out the OWN_AG rmap. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + error = xfs_rmap_free(args->tp, args->agbp, args->agno, + fbno, 1, &oinfo); + if (error) + goto error0; + *stat = 0; return 0; } @@ -2264,6 +2277,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), + offsetof(xfs_agf_t, agf_rmap_blocks), + /* needed so that we don't log the whole rest of the structure: */ + offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b5c213a051cd..08569792fe20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1814,6 +1814,10 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); + /* No such thing as a zero-level tree. */ + if (cur->bc_nlevels == 0) + return -EFSCORRUPTED; + block = NULL; keyno = 0; @@ -4554,15 +4558,22 @@ xfs_btree_simple_query_range( if (error) goto out; + /* Nothing? See if there's anything to the right. */ + if (!stat) { + error = xfs_btree_increment(cur, 0, &stat); + if (error) + goto out; + } + while (stat) { /* Find the record. */ error = xfs_btree_get_rec(cur, &recp, &stat); if (error || !stat) break; - cur->bc_ops->init_high_key_from_rec(&rec_key, recp); /* Skip if high_key(rec) < low_key. */ if (firstrec) { + cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; diff = cur->bc_ops->diff_two_keys(cur, low_key, &rec_key); @@ -4571,6 +4582,7 @@ xfs_btree_simple_query_range( } /* Stop if high_key < low_key(rec). */ + cur->bc_ops->init_key_from_rec(&rec_key, recp); diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); if (diff > 0) break; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 054a2032fdb3..c221d0ecd52e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -194,7 +194,7 @@ xfs_defer_trans_abort( /* Abort intent items. */ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { trace_xfs_defer_pending_abort(tp->t_mountp, dfp); - if (dfp->dfp_committed) + if (!dfp->dfp_done) dfp->dfp_type->abort_intent(dfp->dfp_intent); } @@ -290,7 +290,6 @@ xfs_defer_finish( struct xfs_defer_pending *dfp; struct list_head *li; struct list_head *n; - void *done_item = NULL; void *state; int error = 0; void (*cleanup_fn)(struct xfs_trans *, void *, int); @@ -309,19 +308,11 @@ xfs_defer_finish( if (error) goto out; - /* Mark all pending intents as committed. */ - list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) { - if (dfp->dfp_committed) - break; - trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp); - dfp->dfp_committed = true; - } - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop->dop_pending, struct xfs_defer_pending, dfp_list); trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, dfp->dfp_count); cleanup_fn = dfp->dfp_type->finish_cleanup; @@ -331,7 +322,7 @@ xfs_defer_finish( list_del(li); dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, - done_item, &state); + dfp->dfp_done, &state); if (error) { /* * Clean up after ourselves and jump out. @@ -428,8 +419,8 @@ xfs_defer_add( dfp = kmem_alloc(sizeof(struct xfs_defer_pending), KM_SLEEP | KM_NOFS); dfp->dfp_type = defer_op_types[type]; - dfp->dfp_committed = false; dfp->dfp_intent = NULL; + dfp->dfp_done = NULL; dfp->dfp_count = 0; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, &dop->dop_intake); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index cc3981c48296..e96533d178cf 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -30,8 +30,8 @@ struct xfs_defer_op_type; struct xfs_defer_pending { const struct xfs_defer_op_type *dfp_type; /* function pointers */ struct list_head dfp_list; /* pending items */ - bool dfp_committed; /* committed trans? */ void *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ struct list_head dfp_work; /* work items */ unsigned int dfp_count; /* # extent items */ }; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f814d42c73b2..270fb5cf4fa1 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -640,12 +640,15 @@ typedef struct xfs_agf { __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ uuid_t agf_uuid; /* uuid of filesystem */ + __be32 agf_rmap_blocks; /* rmapbt blocks used */ + __be32 agf_padding; /* padding */ + /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[16]; + __be64 agf_spare64[15]; /* unlogged fields, written during buffer writeback. */ __be64 agf_lsn; /* last write sequence */ @@ -670,7 +673,9 @@ typedef struct xfs_agf { #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_RMAP_BLOCKS 0x00002000 +#define XFS_AGF_SPARE64 0x00004000 +#define XFS_AGF_NUM_BITS 15 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -686,7 +691,9 @@ typedef struct xfs_agf { { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ - { XFS_AGF_UUID, "UUID" } + { XFS_AGF_UUID, "UUID" }, \ + { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index bc1faebc84ec..17b8eeb34ac8 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block( union xfs_btree_ptr *new, int *stat) { + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); int error; xfs_agblock_t bno; @@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block( xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); + be32_add_cpu(&agf->agf_rmap_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; @@ -143,6 +147,8 @@ xfs_rmapbt_free_block( bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, bno, 1); + be32_add_cpu(&agf->agf_rmap_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 0e3d4f5ec33c..4aecc5fefe96 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -583,7 +583,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + return xfs_mount_validate_sb(mp, &sb, + bp->b_maps[0].bm_bn == XFS_SB_DADDR, check_version); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 47a318ce82e0..b5b9bffe3520 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -115,7 +115,6 @@ xfs_buf_ioacct_dec( if (!(bp->b_flags & _XBF_IN_FLIGHT)) return; - ASSERT(bp->b_flags & XBF_ASYNC); bp->b_flags &= ~_XBF_IN_FLIGHT; percpu_counter_dec(&bp->b_target->bt_io_count); } @@ -1612,7 +1611,7 @@ xfs_wait_buftarg( */ while (percpu_counter_sum(&btp->bt_io_count)) delay(100); - drain_workqueue(btp->bt_mount->m_buf_workqueue); + flush_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79205202a29a..fcb81ede2cb0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -741,9 +741,20 @@ xfs_file_dax_write( * page is inserted into the pagecache when we have to serve a write * fault on a hole. It should never be dirtied and can simply be * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. dax_do_io() should really do this invalidation internally as + * it will know if we've allocated over a holei for this specific IO and + * if so it needs to update the mapping tree and invalidate existing + * PTEs over the newly allocated range. Remove this invalidation when + * dax_do_io() is fixed up. */ if (mapping->nrpages) { - ret = invalidate_inode_pages2(mapping); + loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); WARN_ON_ONCE(ret); } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0f96847b90e1..0b7f986745c1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -248,6 +248,7 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_blocks = cpu_to_be32(1); } agf->agf_flfirst = cpu_to_be32(1); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2114d53df433..2af0dda1c978 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -715,12 +715,16 @@ xfs_iomap_write_allocate( * is in the delayed allocation extent on which we sit * but before our buffer starts. */ - nimaps = 0; while (nimaps == 0) { nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + /* + * We have already reserved space for the extent and any + * indirect blocks when creating the delalloc extent, + * there is no need to reserve space in this transaction + * again. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; @@ -1037,20 +1041,14 @@ xfs_file_iomap_begin( return error; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } else if (nimaps) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); - xfs_bmbt_to_iomap(ip, iomap, &imap); } else { + ASSERT(nimaps); + xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - iomap->offset = offset; - iomap->length = length; + trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + xfs_bmbt_to_iomap(ip, iomap, &imap); return 0; } @@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = { .iomap_begin = xfs_file_iomap_begin, .iomap_end = xfs_file_iomap_end, }; + +static int +xfs_xattr_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + struct xfs_bmbt_irec imap; + int nimaps = 1, error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + + /* if there are no attribute fork or extents, return ENOENT */ + if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + error = -ENOENT; + goto out_unlock; + } + + ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); +out_unlock: + xfs_iunlock(ip, lockmode); + + if (!error) { + ASSERT(nimaps); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } + + return error; +} + +struct iomap_ops xfs_xattr_iomap_ops = { + .iomap_begin = xfs_xattr_iomap_begin, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index e066d045e2ff..fb8aca3d69ab 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); extern struct iomap_ops xfs_iomap_ops; +extern struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 6d0d5d413fad..ba99803eba98 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1039,7 +1039,14 @@ xfs_vn_fiemap( int error; xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); - error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_xattr_iomap_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_iomap_ops); + } xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 24ef83ef04de..fd6be45b3a1e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1574,9 +1574,16 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, + "EXPERIMENTAL reverse mapping btree not compatible with realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } xfs_alert(mp, "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); + } error = xfs_mountfs(mp); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 551b7e26980c..d303a665dba9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); DEFINE_IOMAP_EVENT(xfs_iomap_alloc); DEFINE_IOMAP_EVENT(xfs_iomap_found); -DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -2296,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->type = dfp->dfp_type->type; __entry->intent = dfp->dfp_intent; - __entry->committed = dfp->dfp_committed; + __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", |