diff options
Diffstat (limited to 'fs')
82 files changed, 1555 insertions, 1093 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index a2a87117d262..fd5133e26a38 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -74,6 +74,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, cell = rcu_dereference_raw(net->ws_cell); if (cell) { afs_get_cell(cell); + ret = 0; break; } ret = -EDESTADDRREQ; @@ -108,6 +109,9 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, done_seqretry(&net->cells_lock, seq); + if (ret != 0 && cell) + afs_put_cell(net, cell); + return ret == 0 ? cell : ERR_PTR(ret); } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 4f1b6f466ff5..b86195e4dc6c 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -505,18 +505,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) struct afs_call *call = container_of(work, struct afs_call, work); struct afs_uuid *r = call->request; - struct { - __be32 match; - } reply; - _enter(""); if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0) - reply.match = htonl(0); + afs_send_empty_reply(call); else - reply.match = htonl(1); + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + 1, 1, "K-1"); - afs_send_simple_reply(call, &reply, sizeof(reply)); afs_put_call(call); _leave(""); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e640d67274be..139b4e3cc946 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -440,7 +440,7 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, * iterate through the data blob that lists the contents of an AFS directory */ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, - struct key *key) + struct key *key, afs_dataversion_t *_dir_version) { struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_xdr_dir_page *dbuf; @@ -460,6 +460,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, req = afs_read_dir(dvnode, key); if (IS_ERR(req)) return PTR_ERR(req); + *_dir_version = req->data_version; /* round the file position up to the next entry boundary */ ctx->pos += sizeof(union 
afs_xdr_dirent) - 1; @@ -514,7 +515,10 @@ out: */ static int afs_readdir(struct file *file, struct dir_context *ctx) { - return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file)); + afs_dataversion_t dir_version; + + return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file), + &dir_version); } /* @@ -555,7 +559,8 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * - just returns the FID the dentry name maps to if found */ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key) + struct afs_fid *fid, struct key *key, + afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { @@ -568,7 +573,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); /* search the directory */ - ret = afs_dir_iterate(dir, &cookie.ctx, key); + ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; @@ -642,6 +647,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_server *server; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; + afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); int ret, i; _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); @@ -669,12 +675,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->fids[i].vid = as->volume->vid; /* search the directory */ - ret = afs_dir_iterate(dir, &cookie->ctx, key); + ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); if (ret < 0) { inode = ERR_PTR(ret); goto out; } + dentry->d_fsdata = (void *)(unsigned long)data_version; + inode = ERR_PTR(-ENOENT); if (!cookie->found) goto out; @@ -951,7 +959,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, inode ? 
AFS_FS_I(inode) : NULL); } else { trace_afs_lookup(dvnode, &dentry->d_name, - inode ? AFS_FS_I(inode) : NULL); + IS_ERR_OR_NULL(inode) ? NULL + : AFS_FS_I(inode)); } return d; } @@ -968,7 +977,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct inode *inode; struct key *key; - long dir_version, de_version; + afs_dataversion_t dir_version; + long de_version; int ret; if (flags & LOOKUP_RCU) @@ -1014,20 +1024,20 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) * on a 32-bit system, we only have 32 bits in the dentry to store the * version. */ - dir_version = (long)dir->status.data_version; + dir_version = dir->status.data_version; de_version = (long)dentry->d_fsdata; - if (de_version == dir_version) - goto out_valid; + if (de_version == (long)dir_version) + goto out_valid_noupdate; - dir_version = (long)dir->invalid_before; - if (de_version - dir_version >= 0) + dir_version = dir->invalid_before; + if (de_version - (long)dir_version >= 0) goto out_valid; _debug("dir modified"); afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key); + ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1080,7 +1090,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) } out_valid: - dentry->d_fsdata = (void *)dir_version; + dentry->d_fsdata = (void *)(unsigned long)dir_version; +out_valid_noupdate: dput(parent); key_put(key); _leave(" = 1 [valid]"); @@ -1186,6 +1197,20 @@ static void afs_prep_for_new_inode(struct afs_fs_cursor *fc, } /* + * Note that a dentry got changed. We need to set d_fsdata to the data version + * number derived from the result of the operation. It doesn't matter if + * d_fsdata goes backwards as we'll just revalidate. 
+ */ +static void afs_update_dentry_version(struct afs_fs_cursor *fc, + struct dentry *dentry, + struct afs_status_cb *scb) +{ + if (fc->ac.error == 0) + dentry->d_fsdata = + (void *)(unsigned long)scb->status.data_version; +} + +/* * create a directory on an AFS filesystem */ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -1227,6 +1252,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) afs_check_for_remote_deletion(&fc, dvnode); afs_vnode_commit_status(&fc, dvnode, fc.cb_break, &data_version, &scb[0]); + afs_update_dentry_version(&fc, dentry, &scb[0]); afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); ret = afs_end_vnode_operation(&fc); if (ret < 0) @@ -1319,6 +1345,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_vnode_commit_status(&fc, dvnode, fc.cb_break, &data_version, scb); + afs_update_dentry_version(&fc, dentry, scb); ret = afs_end_vnode_operation(&fc); if (ret == 0) { afs_dir_remove_subdir(dentry); @@ -1458,6 +1485,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) &data_version, &scb[0]); afs_vnode_commit_status(&fc, vnode, fc.cb_break_2, &data_version_2, &scb[1]); + afs_update_dentry_version(&fc, dentry, &scb[0]); ret = afs_end_vnode_operation(&fc); if (ret == 0 && !(scb[1].have_status || scb[1].have_error)) ret = afs_dir_remove_link(dvnode, dentry, key); @@ -1526,6 +1554,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, afs_check_for_remote_deletion(&fc, dvnode); afs_vnode_commit_status(&fc, dvnode, fc.cb_break, &data_version, &scb[0]); + afs_update_dentry_version(&fc, dentry, &scb[0]); afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); ret = afs_end_vnode_operation(&fc); if (ret < 0) @@ -1607,6 +1636,7 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_vnode_commit_status(&fc, vnode, fc.cb_break_2, NULL, &scb[1]); ihold(&vnode->vfs_inode); + afs_update_dentry_version(&fc, dentry, &scb[0]); 
d_instantiate(dentry, &vnode->vfs_inode); mutex_unlock(&vnode->io_lock); @@ -1686,6 +1716,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, afs_check_for_remote_deletion(&fc, dvnode); afs_vnode_commit_status(&fc, dvnode, fc.cb_break, &data_version, &scb[0]); + afs_update_dentry_version(&fc, dentry, &scb[0]); afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]); ret = afs_end_vnode_operation(&fc); if (ret < 0) @@ -1791,6 +1822,17 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + /* This bit is potentially nasty as there's a potential race with + * afs_d_revalidate{,_rcu}(). We have to change d_fsdata on the dentry + * to reflect its new parent's new data_version after the op, but + * d_revalidate may see old_dentry between the op having taken place + * and the version being updated. + * + * So drop the old_dentry for now to make other threads go through + * lookup instead - which we hold a lock against. + */ + d_drop(old_dentry); + ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) { afs_dataversion_t orig_data_version; @@ -1802,9 +1844,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, if (orig_dvnode != new_dvnode) { if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { afs_end_vnode_operation(&fc); - goto error_rehash; + goto error_rehash_old; } - new_data_version = new_dvnode->status.data_version; + new_data_version = new_dvnode->status.data_version + 1; } else { new_data_version = orig_data_version; new_scb = &scb[0]; @@ -1827,7 +1869,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, } ret = afs_end_vnode_operation(&fc); if (ret < 0) - goto error_rehash; + goto error_rehash_old; } if (ret == 0) { @@ -1853,10 +1895,26 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } + + /* Now we can update d_fsdata on the dentries to reflect their + 
* new parent's data_version. + * + * Note that if we ever implement RENAME_EXCHANGE, we'll have + * to update both dentries with opposing dir versions. + */ + if (new_dvnode != orig_dvnode) { + afs_update_dentry_version(&fc, old_dentry, &scb[1]); + afs_update_dentry_version(&fc, new_dentry, &scb[1]); + } else { + afs_update_dentry_version(&fc, old_dentry, &scb[0]); + afs_update_dentry_version(&fc, new_dentry, &scb[0]); + } d_move(old_dentry, new_dentry); goto error_tmp; } +error_rehash_old: + d_rehash(new_dentry); error_rehash: if (rehash) d_rehash(rehash); diff --git a/fs/afs/file.c b/fs/afs/file.c index 56b69576274d..dd3c55c9101c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -191,11 +191,13 @@ void afs_put_read(struct afs_read *req) int i; if (refcount_dec_and_test(&req->usage)) { - for (i = 0; i < req->nr_pages; i++) - if (req->pages[i]) - put_page(req->pages[i]); - if (req->pages != req->array) - kfree(req->pages); + if (req->pages) { + for (i = 0; i < req->nr_pages; i++) + if (req->pages[i]) + put_page(req->pages[i]); + if (req->pages != req->array) + kfree(req->pages); + } kfree(req); } } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index d7e0fd3c00df..cfb0ac4bd039 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -56,23 +56,24 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; int j; + int n = entry->nr_servers; tmp = ntohl(uvldb->serverFlags[i]); if (tmp & AFS_VLSF_DONTUSE || (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) continue; if (tmp & AFS_VLSF_RWVOL) { - entry->fs_mask[i] |= AFS_VOL_VTM_RW; + entry->fs_mask[n] |= AFS_VOL_VTM_RW; if (vlflags & AFS_VLF_BACKEXISTS) - entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + entry->fs_mask[n] |= AFS_VOL_VTM_BAK; } if (tmp & AFS_VLSF_ROVOL) - entry->fs_mask[i] |= AFS_VOL_VTM_RO; - if (!entry->fs_mask[i]) + entry->fs_mask[n] |= AFS_VOL_VTM_RO; + if (!entry->fs_mask[n]) continue; xdr = &uvldb->serverNumber[i]; - uuid = (struct afs_uuid 
*)&entry->fs_server[i]; + uuid = (struct afs_uuid *)&entry->fs_server[n]; uuid->time_low = xdr->time_low; uuid->time_mid = htons(ntohl(xdr->time_mid)); uuid->time_hi_and_version = htons(ntohl(xdr->time_hi_and_version)); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 2575503170fc..ca2452806ebf 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -2171,7 +2171,7 @@ int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); size = round_up(acl->size, 4); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus, + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreOpaqueACL2, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(__be32) + size, diff --git a/fs/block_dev.c b/fs/block_dev.c index a6f7c892cb4a..677cb364d33f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -345,24 +345,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; - bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - gfp_t gfp; - ssize_t ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - if (nowait) - gfp = GFP_NOWAIT; - else - gfp = GFP_KERNEL; - - bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool); - if (!bio) - return -EAGAIN; + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -384,10 +375,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!is_poll) blk_start_plug(&plug); - ret = 0; for (;;) { - int err; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; @@ -395,10 +383,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 
int nr_pages) bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - err = bio_iov_iter_get_pages(bio, iter); - if (unlikely(err)) { - if (!ret) - ret = err; + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; @@ -413,14 +399,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) task_io_account_write(bio->bi_iter.bi_size); } - /* - * Tell underlying layer to not block for resource shortage. - * And if we would have blocked, return error inline instead - * of through the bio->bi_end_io() callback. - */ - if (nowait) - bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); - dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; @@ -434,12 +412,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - if (!ret) - ret = -EAGAIN; - goto error; - } - ret = dio->size; if (polled) WRITE_ONCE(iocb->ki_cookie, qc); @@ -460,20 +432,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) atomic_inc(&dio->ref); } - qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - if (!ret) - ret = -EAGAIN; - goto error; - } - ret = dio->size; - - bio = bio_alloc(gfp, nr_pages); - if (!bio) { - if (!ret) - ret = -EAGAIN; - goto error; - } + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) @@ -493,16 +453,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); -out: if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); + if (likely(!ret)) + ret = dio->size; bio_put(&dio->bio); return ret; -error: - if (!is_poll) - blk_finish_plug(&plug); - goto out; } static ssize_t @@ -1754,7 +1711,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) /* finish claiming */ mutex_lock(&bdev->bd_mutex); - bd_finish_claiming(bdev, whole, holder); + if (!res) + 
bd_finish_claiming(bdev, whole, holder); + else + bd_abort_claiming(bdev, whole, holder); /* * Block event polling for write claims if requested. Any * write holder makes the write_holder state stick until diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 299e11e6c554..94660063a162 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -401,7 +401,6 @@ struct btrfs_dev_replace { struct raid_kobject { u64 flags; struct kobject kobj; - struct list_head list; }; /* @@ -915,8 +914,6 @@ struct btrfs_fs_info { u32 thread_pool_size; struct kobject *space_info_kobj; - struct list_head pending_raid_kobjs; - spinlock_t pending_raid_kobjs_lock; /* uncontended */ u64 total_pinned; @@ -2698,7 +2695,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); -void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5f7ee70b3d1a..97beb351a10c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2683,8 +2683,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); - spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d3b58e388535..8b7eb22d508a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> @@ -7888,33 +7889,6 @@ int 
btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } -/* link_block_group will queue up kobjects to add when we're reclaim-safe */ -void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - struct raid_kobject *rkobj; - LIST_HEAD(list); - int ret = 0; - - spin_lock(&fs_info->pending_raid_kobjs_lock); - list_splice_init(&fs_info->pending_raid_kobjs, &list); - spin_unlock(&fs_info->pending_raid_kobjs_lock); - - list_for_each_entry(rkobj, &list, list) { - space_info = btrfs_find_space_info(fs_info, rkobj->flags); - - ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); - if (ret) { - kobject_put(&rkobj->kobj); - break; - } - } - if (ret) - btrfs_warn(fs_info, - "failed to add kobject for block cache, ignoring"); -} - static void link_block_group(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *space_info = cache->space_info; @@ -7929,18 +7903,36 @@ static void link_block_group(struct btrfs_block_group_cache *cache) up_write(&space_info->groups_sem); if (first) { - struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + struct raid_kobject *rkobj; + unsigned int nofs_flag; + int ret; + + /* + * Setup a NOFS context because kobject_add(), deep in its call + * chain, does GFP_KERNEL allocations, and we are often called + * in a context where if reclaim is triggered we can deadlock + * (we are either holding a transaction handle or some lock + * required for a transaction commit). 
+ */ + nofs_flag = memalloc_nofs_save(); + rkobj = kzalloc(sizeof(*rkobj), GFP_KERNEL); if (!rkobj) { + memalloc_nofs_restore(nofs_flag); btrfs_warn(cache->fs_info, "couldn't alloc memory for raid level kobject"); return; } rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - - spin_lock(&fs_info->pending_raid_kobjs_lock); - list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); - spin_unlock(&fs_info->pending_raid_kobjs_lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", + btrfs_bg_type_to_raid_name(rkobj->flags)); + memalloc_nofs_restore(nofs_flag); + if (ret) { + kobject_put(&rkobj->kobj); + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); + return; + } space_info->block_group_kobjs[index] = &rkobj->kobj; } } @@ -8206,7 +8198,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) inc_block_group_ro(cache, 1); } - btrfs_add_raid_kobjects(info); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); error: @@ -8975,6 +8966,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) struct btrfs_device *device; struct list_head *devices; u64 group_trimmed; + u64 range_end = U64_MAX; u64 start; u64 end; u64 trimmed = 0; @@ -8984,16 +8976,23 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int dev_ret = 0; int ret = 0; + /* + * Check range overflow if range->len is set. + * The default range->len is U64_MAX. 
+ */ + if (range->len != U64_MAX && + check_add_overflow(range->start, range->len, &range_end)) + return -EINVAL; + cache = btrfs_lookup_first_block_group(fs_info, range->start); for (; cache; cache = next_block_group(cache)) { - if (cache->key.objectid >= (range->start + range->len)) { + if (cache->key.objectid >= range_end) { btrfs_put_block_group(cache); break; } start = max(range->start, cache->key.objectid); - end = min(range->start + range->len, - cache->key.objectid + cache->key.offset); + end = min(range_end, cache->key.objectid + cache->key.offset); if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ff438fd5bc2..eeb75281894e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3628,6 +3628,13 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + /* * Lock eb pages and flush the bio if we can't the locks * @@ -3699,8 +3706,11 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) { + int err; + + err = flush_write_bio(epd); + if (err < 0) { + ret = err; failed_page_nr = i; goto err_unlock; } @@ -3715,16 +3725,23 @@ err_unlock: /* Unlock already locked pages */ for (i = 0; i < failed_page_nr; i++) unlock_page(eb->pages[i]); + /* + * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. + * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can + * be made and undo everything done before. 
+ */ + btrfs_tree_lock(eb); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + end_extent_buffer_writeback(eb); + spin_unlock(&eb->refs_lock); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, + fs_info->dirty_metadata_batch); + btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_tree_unlock(eb); return ret; } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6c8297bcfeb7..1bfd7e34f31e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4985,7 +4985,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); } } continue; @@ -5000,7 +5000,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE, 0, LLONG_MAX, ctx); if (ret) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5009,7 +5009,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - iput(inode); + btrfs_add_delayed_iput(inode); continue; } @@ -5056,7 +5056,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, } path->slots[0]++; } - iput(inode); + btrfs_add_delayed_iput(inode); } return ret; @@ -5689,7 +5689,7 @@ process_leaf: } if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { - iput(di_inode); + btrfs_add_delayed_iput(di_inode); break; } @@ -5701,7 +5701,7 @@ process_leaf: if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; - iput(di_inode); + 
btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; if (ctx->log_new_dentries) { @@ -5848,7 +5848,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); - iput(dir_inode); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -5891,7 +5891,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - iput(inode); + btrfs_add_delayed_iput(inode); if (ret) return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d74b74ca07af..a447d3ec48d5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,16 +3087,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; - /* - * We add the kobjects here (and after forcing data chunk creation) - * since relocation is the only place we'll create chunks of a new - * type at runtime. The only place where we'll remove the last - * chunk of a type is the call immediately below this one. Even - * so, we're protected against races with the cleaner thread since - * we're covered by the delete_unused_bgs_mutex. 
- */ - btrfs_add_raid_kobjects(fs_info); - trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3223,9 +3213,6 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, btrfs_end_transaction(trans); if (ret < 0) return ret; - - btrfs_add_raid_kobjects(fs_info); - return 1; } } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e078cc55b989..b3c8b886bf64 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -913,8 +913,9 @@ get_more_pages: if (page_offset(page) >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - if (ceph_wbc.size_stable || - page_offset(page) >= i_size_read(inode)) + if ((ceph_wbc.size_stable || + page_offset(page) >= i_size_read(inode)) && + clear_page_dirty_for_io(page)) mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d98dcd976c80..ce0f5658720a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1301,6 +1301,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; + struct ceph_buffer *old_blob = NULL; struct cap_msg_args arg; int held, revoking; int wake = 0; @@ -1365,7 +1366,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); + old_blob = __ceph_build_xattrs_blob(ci); arg.xattr_version = ci->i_xattrs.version; arg.xattr_buf = ci->i_xattrs.blob; } else { @@ -1409,6 +1410,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); + ret = send_cap_msg(&arg); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 791f84a13bb8..18500edefc56 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -736,6 +736,7 @@ static 
int fill_inode(struct inode *inode, struct page *locked_page, int issued, new_issued, info_caps; struct timespec64 mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; + struct ceph_buffer *old_blob = NULL; struct ceph_string *pool_ns = NULL; struct ceph_cap *new_cap = NULL; int err = 0; @@ -881,7 +882,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = xattr_blob; if (xattr_blob) memcpy(ci->i_xattrs.blob->vec.iov_base, @@ -1022,8 +1023,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page, out: if (new_cap) ceph_put_cap(mdsc, new_cap); - if (xattr_blob) - ceph_buffer_put(xattr_blob); + ceph_buffer_put(old_blob); + ceph_buffer_put(xattr_blob); ceph_put_string(pool_ns); return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ac9b53b89365..5083e238ad15 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,8 +111,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_wait_for_completion = ceph_lock_wait_for_completion; err = ceph_mdsc_do_request(mdsc, inode, req); - - if (operation == CEPH_MDS_OP_GETFILELOCK) { + if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4c6494eb02b5..ccfcc66aaf44 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -465,6 +465,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_buffer *old_blob = NULL; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ 
-541,7 +542,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->gid = inode->i_gid; if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); + old_blob = __ceph_build_xattrs_blob(ci); capsnap->xattr_blob = ceph_buffer_get(ci->i_xattrs.blob); capsnap->xattr_version = ci->i_xattrs.version; @@ -584,6 +585,7 @@ update_snapc: } spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); kfree(capsnap); ceph_put_snap_context(old_snapc); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d2352fd95dbc..6b9f1ee7de85 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -926,7 +926,7 @@ extern int ceph_getattr(const struct path *path, struct kstat *stat, int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); -extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern const struct xattr_handler *ceph_xattr_handlers[]; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 37b458a9af3a..939eab7aa219 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -754,12 +754,15 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, /* * If there are dirty xattrs, reencode xattrs into the prealloc_blob - * and swap into place. + * and swap into place. It returns the old i_xattrs.blob (or NULL) so + * that it can be freed by the caller as the i_ceph_lock is likely to be + * held. 
*/ -void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) { struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; + struct ceph_buffer *old_blob = NULL; void *dest; dout("__build_xattrs_blob %p\n", &ci->vfs_inode); @@ -790,12 +793,14 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) dest - ci->i_xattrs.prealloc_blob->vec.iov_base; if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; ci->i_xattrs.version++; } + + return old_blob; } static inline int __get_request_mask(struct inode *in) { @@ -1036,6 +1041,7 @@ int __ceph_setxattr(struct inode *inode, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf = NULL; + struct ceph_buffer *old_blob = NULL; int issued; int err; int dirty = 0; @@ -1109,13 +1115,15 @@ retry: struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); - dout(" preaallocating new blob size=%d\n", required_blob_size); + ceph_buffer_put(old_blob); /* Shouldn't be required */ + dout(" pre-allocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); + /* prealloc_blob can't be released while holding i_ceph_lock */ if (ci->i_xattrs.prealloc_blob) - ceph_buffer_put(ci->i_xattrs.prealloc_blob); + old_blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = blob; goto retry; } @@ -1131,6 +1139,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); if (dirty) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 4b21a90015a9..99caf77df4a2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,5 @@ extern long 
cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.21" +#define CIFS_VERSION "2.22" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e23234207fc2..592a6cea2b79 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -579,6 +579,7 @@ extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); void extract_unc_hostname(const char *unc, const char **h, size_t *len); +int copy_path_name(char *dst, const char *src); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e2f95965065d..3907653e63c7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -942,10 +942,8 @@ PsxDelete: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB add path length overrun check */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, fileName); } params = 6 + name_len; @@ -1015,10 +1013,8 @@ DelFileRetry: remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve check for buffer overruns BB */ - name_len = strnlen(name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->fileName, name, name_len); + } else { + name_len = copy_path_name(pSMB->fileName, name); } pSMB->SearchAttributes = cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); @@ -1062,10 +1058,8 @@ RmDirRetry: remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve check for buffer overruns BB */ - name_len = strnlen(name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->DirName, name, name_len); + } else { + name_len = copy_path_name(pSMB->DirName, name); 
} pSMB->BufferFormat = 0x04; @@ -1107,10 +1101,8 @@ MkDirRetry: remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve check for buffer overruns BB */ - name_len = strnlen(name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->DirName, name, name_len); + } else { + name_len = copy_path_name(pSMB->DirName, name); } pSMB->BufferFormat = 0x04; @@ -1157,10 +1149,8 @@ PsxCreat: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, name, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, name); } params = 6 + name_len; @@ -1324,11 +1314,9 @@ OldOpenRetry: fileName, PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve check for buffer overruns BB */ + } else { count = 0; /* no pad */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->fileName, fileName, name_len); + name_len = copy_path_name(pSMB->fileName, fileName); } if (*pOplock & REQ_OPLOCK) pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK); @@ -1442,11 +1430,8 @@ openRetry: /* BB improve check for buffer overruns BB */ /* no pad */ count = 0; - name_len = strnlen(path, PATH_MAX); - /* trailing null */ - name_len++; + name_len = copy_path_name(req->fileName, path); req->NameLength = cpu_to_le16(name_len); - strncpy(req->fileName, path, name_len); } if (*oplock & REQ_OPLOCK) @@ -2812,15 +2797,10 @@ renameRetry: remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(from_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->OldFileName, from_name, name_len); - name_len2 = strnlen(to_name, PATH_MAX); - name_len2++; /* trailing null */ + } else { + name_len = 
copy_path_name(pSMB->OldFileName, from_name); + name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, to_name); pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); - name_len2++; /* trailing null */ name_len2++; /* signature byte */ } @@ -2962,15 +2942,10 @@ copyRetry: toName, PATH_MAX, nls_codepage, remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fromName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->OldFileName, fromName, name_len); - name_len2 = strnlen(toName, PATH_MAX); - name_len2++; /* trailing null */ + } else { + name_len = copy_path_name(pSMB->OldFileName, fromName); pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); - name_len2++; /* trailing null */ + name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, toName); name_len2++; /* signature byte */ } @@ -3021,10 +2996,8 @@ createSymLinkRetry: name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fromName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, fromName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, fromName); } params = 6 + name_len; pSMB->MaxSetupCount = 0; @@ -3044,10 +3017,8 @@ createSymLinkRetry: PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len_target = strnlen(toName, PATH_MAX); - name_len_target++; /* trailing null */ - strncpy(data_offset, toName, name_len_target); + } else { + name_len_target = copy_path_name(data_offset, toName); } pSMB->MaxParameterCount = cpu_to_le16(2); @@ -3109,10 +3080,8 @@ createHardLinkRetry: name_len++; /* trailing null */ 
name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(toName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, toName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, toName); } params = 6 + name_len; pSMB->MaxSetupCount = 0; @@ -3131,10 +3100,8 @@ createHardLinkRetry: PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len_target = strnlen(fromName, PATH_MAX); - name_len_target++; /* trailing null */ - strncpy(data_offset, fromName, name_len_target); + } else { + name_len_target = copy_path_name(data_offset, fromName); } pSMB->MaxParameterCount = cpu_to_le16(2); @@ -3213,15 +3180,10 @@ winCreateHardLinkRetry: remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(from_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->OldFileName, from_name, name_len); - name_len2 = strnlen(to_name, PATH_MAX); - name_len2++; /* trailing null */ + } else { + name_len = copy_path_name(pSMB->OldFileName, from_name); pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); - name_len2++; /* trailing null */ + name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, to_name); name_len2++; /* signature byte */ } @@ -3271,10 +3233,8 @@ querySymLinkRetry: remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(searchName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, searchName); } params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; @@ -3691,10 +3651,8 @@ queryAclRetry: name_len *= 2; 
pSMB->FileName[name_len] = 0; pSMB->FileName[name_len+1] = 0; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(searchName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, searchName); } params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; @@ -3776,10 +3734,8 @@ setAclRetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, fileName); } params = 6 + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); @@ -4184,9 +4140,7 @@ QInfRetry: name_len++; /* trailing null */ name_len *= 2; } else { - name_len = strnlen(search_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, search_name, name_len); + name_len = copy_path_name(pSMB->FileName, search_name); } pSMB->BufferFormat = 0x04; name_len++; /* account for buffer type byte */ @@ -4321,10 +4275,8 @@ QPathInfoRetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(search_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, search_name, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, search_name); } params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; @@ -4490,10 +4442,8 @@ UnixQPathInfoRetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(searchName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, 
searchName); } params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; @@ -4593,17 +4543,16 @@ findFirstRetry: pSMB->FileName[name_len+1] = 0; name_len += 2; } - } else { /* BB add check for overrun of SMB buf BB */ - name_len = strnlen(searchName, PATH_MAX); -/* BB fix here and in unicode clause above ie - if (name_len > buffersize-header) - free buffer exit; BB */ - strncpy(pSMB->FileName, searchName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, searchName); if (msearch) { - pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb); - pSMB->FileName[name_len+1] = '*'; - pSMB->FileName[name_len+2] = 0; - name_len += 3; + if (WARN_ON_ONCE(name_len > PATH_MAX-2)) + name_len = PATH_MAX-2; + /* overwrite nul byte */ + pSMB->FileName[name_len-1] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[name_len] = '*'; + pSMB->FileName[name_len+1] = 0; + name_len += 2; } } @@ -4898,10 +4847,8 @@ GetInodeNumberRetry: remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(search_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, search_name, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, search_name); } params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; @@ -5008,9 +4955,7 @@ getDFSRetry: name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(search_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->RequestFileName, search_name, name_len); + name_len = copy_path_name(pSMB->RequestFileName, search_name); } if (ses->server->sign) @@ -5663,10 +5608,8 @@ SetEOFRetry: PATH_MAX, cifs_sb->local_nls, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(file_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, file_name, name_len); + 
} else { + name_len = copy_path_name(pSMB->FileName, file_name); } params = 6 + name_len; data_count = sizeof(struct file_end_of_file_info); @@ -5959,10 +5902,8 @@ SetTimesRetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, fileName); } params = 6 + name_len; @@ -6040,10 +5981,8 @@ SetAttrLgcyRetry: PATH_MAX, nls_codepage); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->fileName, fileName, name_len); + } else { + name_len = copy_path_name(pSMB->fileName, fileName); } pSMB->attr = cpu_to_le16(dos_attrs); pSMB->BufferFormat = 0x04; @@ -6203,10 +6142,8 @@ setPermsRetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(file_name, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, file_name, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, file_name); } params = 6 + name_len; @@ -6298,10 +6235,8 @@ QAllEAsRetry: PATH_MAX, nls_codepage, remap); list_len++; /* trailing null */ list_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - list_len = strnlen(searchName, PATH_MAX); - list_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, list_len); + } else { + list_len = copy_path_name(pSMB->FileName, searchName); } params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */; @@ -6480,10 +6415,8 @@ SetEARetry: PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = 
strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + } else { + name_len = copy_path_name(pSMB->FileName, fileName); } params = 6 + name_len; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a4830ced0f98..5299effa6f7d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1113,6 +1113,7 @@ cifs_demultiplex_thread(void *p) mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); + allow_kernel_signal(SIGKILL); while (server->tcpStatus != CifsExiting) { if (try_to_freeze()) continue; @@ -2980,6 +2981,7 @@ static int cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) { int rc = 0; + int is_domain = 0; const char *delim, *payload; char *desc; ssize_t len; @@ -3027,6 +3029,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) rc = PTR_ERR(key); goto out_err; } + is_domain = 1; } down_read(&key->sem); @@ -3084,6 +3087,26 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) goto out_key_put; } + /* + * If we have a domain key then we must set the domainName in the + * for the request. 
+ */ + if (is_domain && ses->domainName) { + vol->domainname = kstrndup(ses->domainName, + strlen(ses->domainName), + GFP_KERNEL); + if (!vol->domainname) { + cifs_dbg(FYI, "Unable to allocate %zd bytes for " + "domain\n", len); + rc = -ENOMEM; + kfree(vol->username); + vol->username = NULL; + kzfree(vol->password); + vol->password = NULL; + goto out_key_put; + } + } + out_key_put: up_read(&key->sem); key_put(key); @@ -4208,16 +4231,19 @@ build_unc_path_to_root(const struct smb_vol *vol, strlen(vol->prepath) + 1 : 0; unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); + if (unc_len > MAX_TREE_SIZE) + return ERR_PTR(-EINVAL); + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); if (full_path == NULL) return ERR_PTR(-ENOMEM); - strncpy(full_path, vol->UNC, unc_len); + memcpy(full_path, vol->UNC, unc_len); pos = full_path + unc_len; if (pplen) { *pos = CIFS_DIR_SEP(cifs_sb); - strncpy(pos + 1, vol->prepath, pplen); + memcpy(pos + 1, vol->prepath, pplen); pos += pplen; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f26a48dd2e39..be424e81e3ad 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -69,11 +69,10 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, return full_path; if (dfsplen) - strncpy(full_path, tcon->treeName, dfsplen); + memcpy(full_path, tcon->treeName, dfsplen); full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); - strncpy(full_path + dfsplen + 1, vol->prepath, pplen); + memcpy(full_path + dfsplen + 1, vol->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; } diff --git a/fs/cifs/export.c b/fs/cifs/export.c index ce8b7f677c58..eb0bb8ca8e63 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/nfs/Exporting + * See Documentation/filesystems/nfs/exporting.rst * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git 
a/fs/cifs/misc.c b/fs/cifs/misc.c index f383877a6511..5ad83bdb9bea 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1011,3 +1011,25 @@ void extract_unc_hostname(const char *unc, const char **h, size_t *len) *h = unc; *len = end - unc; } + +/** + * copy_path_name - copy src path to dst, possibly truncating + * + * returns number of bytes written (including trailing nul) + */ +int copy_path_name(char *dst, const char *src) +{ + int name_len; + + /* + * PATH_MAX includes nul, so if strlen(src) >= PATH_MAX it + * will truncate and strlen(dst) will be PATH_MAX-1 + */ + name_len = strscpy(dst, src, PATH_MAX); + if (WARN_ON_ONCE(name_len < 0)) + name_len = PATH_MAX-1; + + /* we count the trailing nul */ + name_len++; + return name_len; +} diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index dcd49ad60c83..4c764ff7edd2 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -159,13 +159,16 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { char *bcc_ptr = *pbcc_area; + int len; /* copy user */ /* BB what about null user mounts - check that we do this BB */ /* copy user */ if (ses->user_name != NULL) { - strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); - bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); + len = strscpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_USERNAME_LEN - 1; + bcc_ptr += len; } /* else null user mount */ *bcc_ptr = 0; @@ -173,8 +176,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy domain */ if (ses->domainName != NULL) { - strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); - bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_DOMAINNAME_LEN - 1; + bcc_ptr += len; } /* else we will send a null domain name so the server will default to its own domain */ *bcc_ptr = 
0; @@ -242,9 +247,10 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, kfree(ses->serverOS); - ses->serverOS = kzalloc(len + 1, GFP_KERNEL); + ses->serverOS = kmalloc(len + 1, GFP_KERNEL); if (ses->serverOS) { - strncpy(ses->serverOS, bcc_ptr, len); + memcpy(ses->serverOS, bcc_ptr, len); + ses->serverOS[len] = 0; if (strncmp(ses->serverOS, "OS/2", 4) == 0) cifs_dbg(FYI, "OS/2 server\n"); } @@ -258,9 +264,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, kfree(ses->serverNOS); - ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); - if (ses->serverNOS) - strncpy(ses->serverNOS, bcc_ptr, len); + ses->serverNOS = kmalloc(len + 1, GFP_KERNEL); + if (ses->serverNOS) { + memcpy(ses->serverNOS, bcc_ptr, len); + ses->serverNOS[len] = 0; + } bcc_ptr += len + 1; bleft -= len + 1; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a5bc1b671c12..64a5864127be 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3489,7 +3489,15 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { - sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); + void *addr; + /* + * VMAP_STACK (at least) puts stack into the vmalloc address space + */ + if (is_vmalloc_addr(buf)) + addr = vmalloc_to_page(buf); + else + addr = virt_to_page(buf); + sg_set_page(sg, addr, buflen, offset_in_page(buf)); } /* Assumes the first rqst has a transform header as the first iov. 
@@ -4070,7 +4078,6 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int ret, length; char *buf = server->smallbuf; - char *tmpbuf; struct smb2_sync_hdr *shdr; unsigned int pdu_length = server->pdu_size; unsigned int buf_size; @@ -4100,18 +4107,15 @@ receive_encrypted_standard(struct TCP_Server_Info *server, return length; next_is_large = server->large_buf; - one_more: +one_more: shdr = (struct smb2_sync_hdr *)buf; if (shdr->NextCommand) { - if (next_is_large) { - tmpbuf = server->bigbuf; + if (next_is_large) next_buffer = (char *)cifs_buf_get(); - } else { - tmpbuf = server->smallbuf; + else next_buffer = (char *)cifs_small_buf_get(); - } memcpy(next_buffer, - tmpbuf + le32_to_cpu(shdr->NextCommand), + buf + le32_to_cpu(shdr->NextCommand), pdu_length - le32_to_cpu(shdr->NextCommand)); } @@ -4140,12 +4144,21 @@ receive_encrypted_standard(struct TCP_Server_Info *server, pdu_length -= le32_to_cpu(shdr->NextCommand); server->large_buf = next_is_large; if (next_is_large) - server->bigbuf = next_buffer; + server->bigbuf = buf = next_buffer; else - server->smallbuf = next_buffer; - - buf += le32_to_cpu(shdr->NextCommand); + server->smallbuf = buf = next_buffer; goto one_more; + } else if (ret != 0) { + /* + * ret != 0 here means that we didn't get to handle_mid() thus + * server->smallbuf and server->bigbuf are still valid. We need + * to free next_buffer because it is not going to be used + * anywhere. 
+ */ + if (next_is_large) + free_rsp_buf(CIFS_LARGE_BUFFER, next_buffer); + else + free_rsp_buf(CIFS_SMALL_BUFFER, next_buffer); } return ret; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c8cd7b6cdda2..31e4a1b0b170 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (tcon == NULL) return 0; - if (smb2_command == SMB2_TREE_CONNECT) + if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) return 0; if (tcon->tidStatus == CifsExiting) { @@ -1196,7 +1196,12 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) else req->SecurityMode = 0; +#ifdef CONFIG_CIFS_DFS_UPCALL + req->Capabilities = cpu_to_le32(SMB2_GLOBAL_CAP_DFS); +#else req->Capabilities = 0; +#endif /* DFS_UPCALL */ + req->Channel = 0; /* MBZ */ sess_data->iov[0].iov_base = (char *)req; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6e30949d9f77..a7ec2d3dff92 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -638,9 +638,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) -/* PPPOX */ -COMPATIBLE_IOCTL(PPPOEIOCSFWD) -COMPATIBLE_IOCTL(PPPOEIOCDFWD) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index f752d83a9c44..520f1813e789 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -20,6 +20,15 @@ #include <linux/list.h> #include <linux/spinlock.h> +struct configfs_fragment { + atomic_t frag_count; + struct rw_semaphore frag_sem; + bool frag_dead; +}; + +void put_fragment(struct configfs_fragment *); +struct configfs_fragment *get_fragment(struct configfs_fragment *); + struct configfs_dirent { atomic_t s_count; int s_dependent_count; @@ -34,6 +43,7 @@ struct configfs_dirent { #ifdef CONFIG_LOCKDEP int s_depth; #endif + struct configfs_fragment *s_frag; }; #define CONFIGFS_ROOT 
0x0001 @@ -61,8 +71,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_create_bin_file(struct config_item *, const struct configfs_bin_attribute *); -extern int configfs_make_dirent(struct configfs_dirent *, - struct dentry *, void *, umode_t, int); +extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, + void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); @@ -137,6 +147,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd) { if (!(sd->s_type & CONFIGFS_ROOT)) { kfree(sd->s_iattr); + put_fragment(sd->s_frag); kmem_cache_free(configfs_dir_cachep, sd); } } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 92112915de8e..79fc25aaa8cd 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -151,11 +151,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) #endif /* CONFIG_LOCKDEP */ +static struct configfs_fragment *new_fragment(void) +{ + struct configfs_fragment *p; + + p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL); + if (p) { + atomic_set(&p->frag_count, 1); + init_rwsem(&p->frag_sem); + p->frag_dead = false; + } + return p; +} + +void put_fragment(struct configfs_fragment *frag) +{ + if (frag && atomic_dec_and_test(&frag->frag_count)) + kfree(frag); +} + +struct configfs_fragment *get_fragment(struct configfs_fragment *frag) +{ + if (likely(frag)) + atomic_inc(&frag->frag_count); + return frag; +} + /* * Allocates a new configfs_dirent and links it to the parent configfs_dirent */ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, - void *element, int type) + void *element, int type, + struct configfs_fragment *frag) { struct configfs_dirent * sd; @@ -175,6 
+202,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren kmem_cache_free(configfs_dir_cachep, sd); return ERR_PTR(-ENOENT); } + sd->s_frag = get_fragment(frag); list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); @@ -209,11 +237,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd, int configfs_make_dirent(struct configfs_dirent * parent_sd, struct dentry * dentry, void * element, - umode_t mode, int type) + umode_t mode, int type, struct configfs_fragment *frag) { struct configfs_dirent * sd; - sd = configfs_new_dirent(parent_sd, element, type); + sd = configfs_new_dirent(parent_sd, element, type, frag); if (IS_ERR(sd)) return PTR_ERR(sd); @@ -260,7 +288,8 @@ static void init_symlink(struct inode * inode) * until it is validated by configfs_dir_set_ready() */ -static int configfs_create_dir(struct config_item *item, struct dentry *dentry) +static int configfs_create_dir(struct config_item *item, struct dentry *dentry, + struct configfs_fragment *frag) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; @@ -273,7 +302,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) return error; error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, - CONFIGFS_DIR | CONFIGFS_USET_CREATING); + CONFIGFS_DIR | CONFIGFS_USET_CREATING, + frag); if (unlikely(error)) return error; @@ -338,9 +368,10 @@ int configfs_create_link(struct configfs_symlink *sl, { int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; + struct configfs_dirent *p = parent->d_fsdata; - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, - CONFIGFS_ITEM_LINK); + err = configfs_make_dirent(p, dentry, sl, mode, + CONFIGFS_ITEM_LINK, p->s_frag); if (!err) { err = configfs_create(dentry, mode, init_symlink); if (err) { @@ -599,7 +630,8 @@ static int populate_attrs(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct 
config_item *item, - struct dentry *dentry); + struct dentry *dentry, + struct configfs_fragment *frag); static void configfs_detach_group(struct config_item *item); static void detach_groups(struct config_group *group) @@ -647,7 +679,8 @@ static void detach_groups(struct config_group *group) * try using vfs_mkdir. Just a thought. */ static int create_default_group(struct config_group *parent_group, - struct config_group *group) + struct config_group *group, + struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; @@ -663,7 +696,7 @@ static int create_default_group(struct config_group *parent_group, d_add(child, NULL); ret = configfs_attach_group(&parent_group->cg_item, - &group->cg_item, child); + &group->cg_item, child, frag); if (!ret) { sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; @@ -677,13 +710,14 @@ static int create_default_group(struct config_group *parent_group, return ret; } -static int populate_groups(struct config_group *group) +static int populate_groups(struct config_group *group, + struct configfs_fragment *frag) { struct config_group *new_group; int ret = 0; list_for_each_entry(new_group, &group->default_groups, group_entry) { - ret = create_default_group(group, new_group); + ret = create_default_group(group, new_group, frag); if (ret) { detach_groups(group); break; @@ -797,11 +831,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g */ static int configfs_attach_item(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + struct configfs_fragment *frag) { int ret; - ret = configfs_create_dir(item, dentry); + ret = configfs_create_dir(item, dentry, frag); if (!ret) { ret = populate_attrs(item); if (ret) { @@ -831,12 +866,13 @@ static void configfs_detach_item(struct config_item *item) static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, - struct dentry *dentry) + struct dentry *dentry, + 
struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); if (!ret) { sd = dentry->d_fsdata; sd->s_type |= CONFIGFS_USET_DIR; @@ -852,7 +888,7 @@ static int configfs_attach_group(struct config_item *parent_item, */ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); - ret = populate_groups(to_config_group(item)); + ret = populate_groups(to_config_group(item), frag); if (ret) { configfs_detach_item(item); d_inode(dentry)->i_flags |= S_DEAD; @@ -1247,6 +1283,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct configfs_dirent *sd; const struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; + struct configfs_fragment *frag; char *name; sd = dentry->d_parent->d_fsdata; @@ -1265,6 +1302,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } + frag = new_fragment(); + if (!frag) { + ret = -ENOMEM; + goto out; + } + /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; @@ -1367,9 +1410,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode spin_unlock(&configfs_dirent_lock); if (group) - ret = configfs_attach_group(parent_item, item, dentry); + ret = configfs_attach_group(parent_item, item, dentry, frag); else - ret = configfs_attach_item(parent_item, item, dentry); + ret = configfs_attach_item(parent_item, item, dentry, frag); spin_lock(&configfs_dirent_lock); sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; @@ -1406,6 +1449,7 @@ out_put: * reference. 
*/ config_item_put(parent_item); + put_fragment(frag); out: return ret; @@ -1417,6 +1461,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct config_item *item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; + struct configfs_fragment *frag; struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; @@ -1474,6 +1519,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } } while (ret == -EAGAIN); + frag = sd->s_frag; + if (down_write_killable(&frag->frag_sem)) { + spin_lock(&configfs_dirent_lock); + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + return -EINTR; + } + frag->frag_dead = true; + up_write(&frag->frag_sem); + /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1574,7 +1629,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) */ err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { - file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); if (IS_ERR(file->private_data)) err = PTR_ERR(file->private_data); else @@ -1732,8 +1787,13 @@ int configfs_register_group(struct config_group *parent_group, { struct configfs_subsystem *subsys = parent_group->cg_subsys; struct dentry *parent; + struct configfs_fragment *frag; int ret; + frag = new_fragment(); + if (!frag) + return -ENOMEM; + mutex_lock(&subsys->su_mutex); link_group(parent_group, group); mutex_unlock(&subsys->su_mutex); @@ -1741,7 +1801,7 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - ret = create_default_group(parent_group, group); + ret = create_default_group(parent_group, group, frag); if (ret) goto err_out; @@ -1749,12 +1809,14 @@ int configfs_register_group(struct config_group *parent_group, 
configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); inode_unlock(d_inode(parent)); + put_fragment(frag); return 0; err_out: inode_unlock(d_inode(parent)); mutex_lock(&subsys->su_mutex); unlink_group(group); mutex_unlock(&subsys->su_mutex); + put_fragment(frag); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1770,16 +1832,12 @@ void configfs_unregister_group(struct config_group *group) struct configfs_subsystem *subsys = group->cg_subsys; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; - mutex_lock(&subsys->su_mutex); - if (!group->cg_item.ci_parent->ci_group) { - /* - * The parent has already been unlinked and detached - * due to a rmdir. - */ - goto unlink_group; - } - mutex_unlock(&subsys->su_mutex); + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); @@ -1796,7 +1854,6 @@ void configfs_unregister_group(struct config_group *group) dput(dentry); mutex_lock(&subsys->su_mutex); -unlink_group: unlink_group(group); mutex_unlock(&subsys->su_mutex); } @@ -1853,10 +1910,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) struct dentry *dentry; struct dentry *root; struct configfs_dirent *sd; + struct configfs_fragment *frag; + + frag = new_fragment(); + if (!frag) + return -ENOMEM; root = configfs_pin_fs(); - if (IS_ERR(root)) + if (IS_ERR(root)) { + put_fragment(frag); return PTR_ERR(root); + } if (!group->cg_item.ci_name) group->cg_item.ci_name = group->cg_item.ci_namebuf; @@ -1872,7 +1936,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) d_add(dentry, NULL); err = configfs_attach_group(sd->s_element, &group->cg_item, - dentry); + dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); @@ 
-1890,6 +1954,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) unlink_group(group); configfs_release_fs(); } + put_fragment(frag); return err; } @@ -1899,12 +1964,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct config_group *group = &subsys->su_group; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *root = dentry->d_sb->s_root; + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_fragment *frag = sd->s_frag; if (dentry->d_parent != root) { pr_err("Tried to unregister non-subsystem!\n"); return; } + down_write(&frag->frag_sem); + frag->frag_dead = true; + up_write(&frag->frag_sem); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 61e4db4390a1..fb65b706cc0d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -39,40 +39,44 @@ struct configfs_buffer { bool write_in_progress; char *bin_buffer; int bin_buffer_size; + int cb_max_size; + struct config_item *item; + struct module *owner; + union { + struct configfs_attribute *attr; + struct configfs_bin_attribute *bin_attr; + }; }; +static inline struct configfs_fragment *to_frag(struct file *file) +{ + struct configfs_dirent *sd = file->f_path.dentry->d_fsdata; -/** - * fill_read_buffer - allocate and fill buffer from item. - * @dentry: dentry pointer. - * @buffer: data buffer for file. - * - * Allocate @buffer->page, if it hasn't been already, then call the - * config_item's show() method to fill the buffer with this attribute's - * data. - * This is called only once, on the file's first read. 
- */ -static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) + return sd->s_frag; +} + +static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - int ret = 0; - ssize_t count; + struct configfs_fragment *frag = to_frag(file); + ssize_t count = -ENOENT; if (!buffer->page) buffer->page = (char *) get_zeroed_page(GFP_KERNEL); if (!buffer->page) return -ENOMEM; - count = attr->show(item, buffer->page); - - BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); - if (count >= 0) { - buffer->needs_read_fill = 0; - buffer->count = count; - } else - ret = count; - return ret; + down_read(&frag->frag_sem); + if (!frag->frag_dead) + count = buffer->attr->show(buffer->item, buffer->page); + up_read(&frag->frag_sem); + + if (count < 0) + return count; + if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE)) + return -EIO; + buffer->needs_read_fill = 0; + buffer->count = count; + return 0; } /** @@ -97,12 +101,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf static ssize_t configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; mutex_lock(&buffer->mutex); if (buffer->needs_read_fill) { - if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + retval = fill_read_buffer(file, buffer); + if (retval) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", @@ -138,10 +143,8 @@ static ssize_t configfs_read_bin_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct configfs_fragment *frag = to_frag(file); struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr 
= to_bin_attr(dentry); ssize_t retval = 0; ssize_t len = min_t(size_t, count, PAGE_SIZE); @@ -156,14 +159,19 @@ configfs_read_bin_file(struct file *file, char __user *buf, if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ - len = bin_attr->read(item, NULL, 0); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, NULL, 0); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len <= 0) { retval = len; goto out; } /* do not exceed the maximum value */ - if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { + if (buffer->cb_max_size && len > buffer->cb_max_size) { retval = -EFBIG; goto out; } @@ -176,7 +184,13 @@ configfs_read_bin_file(struct file *file, char __user *buf, buffer->bin_buffer_size = len; /* perform second read to fill buffer */ - len = bin_attr->read(item, buffer->bin_buffer, len); + down_read(&frag->frag_sem); + if (!frag->frag_dead) + len = buffer->bin_attr->read(buffer->item, + buffer->bin_buffer, len); + else + len = -ENOENT; + up_read(&frag->frag_sem); if (len < 0) { retval = len; vfree(buffer->bin_buffer); @@ -226,25 +240,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size return error ? -EFAULT : count; } - -/** - * flush_write_buffer - push buffer to config_item. - * @dentry: dentry to the attribute - * @buffer: data buffer for file. - * @count: number of bytes - * - * Get the correct pointers for the config_item and the attribute we're - * dealing with, then call the store() method for the attribute, - * passing the buffer that we acquired in fill_write_buffer(). 
- */ - static int -flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count) { - struct configfs_attribute * attr = to_attr(dentry); - struct config_item * item = to_item(dentry->d_parent); - - return attr->store(item, buffer->page, count); + struct configfs_fragment *frag = to_frag(file); + int res = -ENOENT; + + down_read(&frag->frag_sem); + if (!frag->frag_dead) + res = buffer->attr->store(buffer->item, buffer->page, count); + up_read(&frag->frag_sem); + return res; } @@ -268,13 +274,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct configfs_buffer * buffer = file->private_data; + struct configfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, len); + len = flush_write_buffer(file, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); @@ -299,8 +305,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer *buffer = file->private_data; - struct dentry *dentry = file->f_path.dentry; - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); void *tbuf = NULL; ssize_t len; @@ -316,8 +320,8 @@ configfs_write_bin_file(struct file *file, const char __user *buf, /* buffer grows? 
*/ if (*ppos + count > buffer->bin_buffer_size) { - if (bin_attr->cb_max_size && - *ppos + count > bin_attr->cb_max_size) { + if (buffer->cb_max_size && + *ppos + count > buffer->cb_max_size) { len = -EFBIG; goto out; } @@ -349,31 +353,51 @@ out: return len; } -static int check_perm(struct inode * inode, struct file * file, int type) +static int __configfs_open_file(struct inode *inode, struct file *file, int type) { - struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(file->f_path.dentry); - struct configfs_bin_attribute *bin_attr = NULL; - struct configfs_buffer * buffer; - struct configfs_item_operations * ops = NULL; - int error = 0; + struct dentry *dentry = file->f_path.dentry; + struct configfs_fragment *frag = to_frag(file); + struct configfs_attribute *attr; + struct configfs_buffer *buffer; + int error; - if (!item || !attr) - goto Einval; + error = -ENOMEM; + buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL); + if (!buffer) + goto out; - if (type & CONFIGFS_ITEM_BIN_ATTR) - bin_attr = to_bin_attr(file->f_path.dentry); + error = -ENOENT; + down_read(&frag->frag_sem); + if (unlikely(frag->frag_dead)) + goto out_free_buffer; - /* Grab the module reference for this attribute if we have one */ - if (!try_module_get(attr->ca_owner)) { - error = -ENODEV; - goto Done; + error = -EINVAL; + buffer->item = to_item(dentry->d_parent); + if (!buffer->item) + goto out_free_buffer; + + attr = to_attr(dentry); + if (!attr) + goto out_put_item; + + if (type & CONFIGFS_ITEM_BIN_ATTR) { + buffer->bin_attr = to_bin_attr(dentry); + buffer->cb_max_size = buffer->bin_attr->cb_max_size; + } else { + buffer->attr = attr; } - if (item->ci_type) - ops = item->ci_type->ct_item_ops; - else - goto Eaccess; + buffer->owner = attr->ca_owner; + /* Grab the module reference for this attribute if we have one */ + error = -ENODEV; + if (!try_module_get(buffer->owner)) + goto out_put_item; + + error = 
-EACCES; + if (!buffer->item->ci_type) + goto out_put_module; + + buffer->ops = buffer->item->ci_type->ct_item_ops; /* File needs write support. * The inode's perms must say it's ok, @@ -381,13 +405,11 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_WRITE) { if (!(inode->i_mode & S_IWUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write) + goto out_put_module; } /* File needs read support. @@ -396,92 +418,72 @@ static int check_perm(struct inode * inode, struct file * file, int type) */ if (file->f_mode & FMODE_READ) { if (!(inode->i_mode & S_IRUGO)) - goto Eaccess; - + goto out_put_module; if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) - goto Eaccess; - - if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) - goto Eaccess; + goto out_put_module; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read) + goto out_put_module; } - /* No error? Great, allocate a buffer for the file, and store it - * it in file->private_data for easy access. 
- */ - buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); - if (!buffer) { - error = -ENOMEM; - goto Enomem; - } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; buffer->read_in_progress = false; buffer->write_in_progress = false; - buffer->ops = ops; file->private_data = buffer; - goto Done; + up_read(&frag->frag_sem); + return 0; - Einval: - error = -EINVAL; - goto Done; - Eaccess: - error = -EACCES; - Enomem: - module_put(attr->ca_owner); - Done: - if (error && item) - config_item_put(item); +out_put_module: + module_put(buffer->owner); +out_put_item: + config_item_put(buffer->item); +out_free_buffer: + up_read(&frag->frag_sem); + kfree(buffer); +out: return error; } static int configfs_release(struct inode *inode, struct file *filp) { - struct config_item * item = to_item(filp->f_path.dentry->d_parent); - struct configfs_attribute * attr = to_attr(filp->f_path.dentry); - struct module * owner = attr->ca_owner; - struct configfs_buffer * buffer = filp->private_data; - - if (item) - config_item_put(item); - /* After this point, attr should not be accessed. 
*/ - module_put(owner); - - if (buffer) { - if (buffer->page) - free_page((unsigned long)buffer->page); - mutex_destroy(&buffer->mutex); - kfree(buffer); - } + struct configfs_buffer *buffer = filp->private_data; + + module_put(buffer->owner); + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); return 0; } static int configfs_open_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR); } static int configfs_open_bin_file(struct inode *inode, struct file *filp) { - return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); + return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR); } -static int configfs_release_bin_file(struct inode *inode, struct file *filp) +static int configfs_release_bin_file(struct inode *inode, struct file *file) { - struct configfs_buffer *buffer = filp->private_data; - struct dentry *dentry = filp->f_path.dentry; - struct config_item *item = to_item(dentry->d_parent); - struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); - ssize_t len = 0; - int ret; + struct configfs_buffer *buffer = file->private_data; buffer->read_in_progress = false; if (buffer->write_in_progress) { + struct configfs_fragment *frag = to_frag(file); buffer->write_in_progress = false; - len = bin_attr->write(item, buffer->bin_buffer, - buffer->bin_buffer_size); - + down_read(&frag->frag_sem); + if (!frag->frag_dead) { + /* result of ->release() is ignored */ + buffer->bin_attr->write(buffer->item, + buffer->bin_buffer, + buffer->bin_buffer_size); + } + up_read(&frag->frag_sem); /* vfree on NULL is safe */ vfree(buffer->bin_buffer); buffer->bin_buffer = NULL; @@ -489,10 +491,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) buffer->needs_read_fill = 1; } - ret = configfs_release(inode, filp); - if (len < 0) - return len; - return ret; + configfs_release(inode, 
file); + return 0; } @@ -527,7 +527,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, - CONFIGFS_ITEM_ATTR); + CONFIGFS_ITEM_ATTR, parent_sd->s_frag); inode_unlock(d_inode(dir)); return error; @@ -549,7 +549,7 @@ int configfs_create_bin_file(struct config_item *item, inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, - CONFIGFS_ITEM_BIN_ATTR); + CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag); inode_unlock(dir->d_inode); return error; @@ -600,7 +600,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) * guaranteed to either see new references or prevent new * references from being established. */ - unmap_mapping_range(mapping, 0, 0, 1); + unmap_mapping_range(mapping, 0, 0, 0); xas_lock_irq(&xas); xas_for_each(&xas, entry, ULONG_MAX) { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index f0e549783caf..09bc68708d28 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -7,7 +7,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/nfs/Exporting. + * take a look at Documentation/filesystems/nfs/exporting.rst. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..006b7a2070bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4586,7 +4586,6 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; - struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4675,7 +4674,6 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. 
*/ - blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4706,7 +4704,6 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); - blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4df26ef2b2b1..4f8b5fd6c81f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -390,6 +390,19 @@ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) return mp->mp_aheight - x - 1; } +static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp) +{ + sector_t factor = 1, block = 0; + int hgt; + + for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) { + if (hgt < mp->mp_aheight) + block += mp->mp_list[hgt] * factor; + factor *= sdp->sd_inptrs; + } + return block; +} + static void release_metapath(struct metapath *mp) { int i; @@ -430,60 +443,84 @@ static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *pt return ptr - first; } -typedef const __be64 *(*gfs2_metadata_walker)( - struct metapath *mp, - const __be64 *start, const __be64 *end, - u64 factor, void *data); +enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE }; -#define WALK_STOP ((__be64 *)0) -#define WALK_NEXT ((__be64 *)1) +/* + * gfs2_metadata_walker - walk an indirect block + * @mp: Metapath to indirect block + * @ptrs: Number of pointers to look at + * + * When returning WALK_FOLLOW, the walker must update @mp to point at the right + * indirect block to follow. 
+ */ +typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp, + unsigned int ptrs); -static int gfs2_walk_metadata(struct inode *inode, sector_t lblock, - u64 len, struct metapath *mp, gfs2_metadata_walker walker, - void *data) +/* + * gfs2_walk_metadata - walk a tree of indirect blocks + * @inode: The inode + * @mp: Starting point of walk + * @max_len: Maximum number of blocks to walk + * @walker: Called during the walk + * + * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or + * past the end of metadata, and a negative error code otherwise. + */ + +static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp, + u64 max_len, gfs2_metadata_walker walker) { - struct metapath clone; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - const __be64 *start, *end, *ptr; u64 factor = 1; unsigned int hgt; - int ret = 0; + int ret; - for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--) + /* + * The walk starts in the lowest allocated indirect block, which may be + * before the position indicated by @mp. Adjust @max_len accordingly + * to avoid a short walk. + */ + for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) { + max_len += mp->mp_list[hgt] * factor; + mp->mp_list[hgt] = 0; factor *= sdp->sd_inptrs; + } for (;;) { - u64 step; + u16 start = mp->mp_list[hgt]; + enum walker_status status; + unsigned int ptrs; + u64 len; /* Walk indirect block. */ - start = metapointer(hgt, mp); - end = metaend(hgt, mp); - - step = (end - start) * factor; - if (step > len) - end = start + DIV_ROUND_UP_ULL(len, factor); - - ptr = walker(mp, start, end, factor, data); - if (ptr == WALK_STOP) + ptrs = (hgt >= 1 ? 
sdp->sd_inptrs : sdp->sd_diptrs) - start; + len = ptrs * factor; + if (len > max_len) + ptrs = DIV_ROUND_UP_ULL(max_len, factor); + status = walker(mp, ptrs); + switch (status) { + case WALK_STOP: + return 1; + case WALK_FOLLOW: + BUG_ON(mp->mp_aheight == mp->mp_fheight); + ptrs = mp->mp_list[hgt] - start; + len = ptrs * factor; break; - if (step >= len) + case WALK_CONTINUE: break; - len -= step; - if (ptr != WALK_NEXT) { - BUG_ON(!*ptr); - mp->mp_list[hgt] += ptr - start; - goto fill_up_metapath; } + if (len >= max_len) + break; + max_len -= len; + if (status == WALK_FOLLOW) + goto fill_up_metapath; lower_metapath: /* Decrease height of metapath. */ - if (mp != &clone) { - clone_metapath(&clone, mp); - mp = &clone; - } brelse(mp->mp_bh[hgt]); mp->mp_bh[hgt] = NULL; + mp->mp_list[hgt] = 0; if (!hgt) break; hgt--; @@ -491,10 +528,7 @@ lower_metapath: /* Advance in metadata tree. */ (mp->mp_list[hgt])++; - start = metapointer(hgt, mp); - end = metaend(hgt, mp); - if (start >= end) { - mp->mp_list[hgt] = 0; + if (mp->mp_list[hgt] >= sdp->sd_inptrs) { if (!hgt) break; goto lower_metapath; @@ -502,44 +536,36 @@ lower_metapath: fill_up_metapath: /* Increase height of metapath. 
*/ - if (mp != &clone) { - clone_metapath(&clone, mp); - mp = &clone; - } ret = fillup_metapath(ip, mp, ip->i_height - 1); if (ret < 0) - break; + return ret; hgt += ret; for (; ret; ret--) do_div(factor, sdp->sd_inptrs); mp->mp_aheight = hgt + 1; } - if (mp == &clone) - release_metapath(mp); - return ret; + return 0; } -struct gfs2_hole_walker_args { - u64 blocks; -}; - -static const __be64 *gfs2_hole_walker(struct metapath *mp, - const __be64 *start, const __be64 *end, - u64 factor, void *data) +static enum walker_status gfs2_hole_walker(struct metapath *mp, + unsigned int ptrs) { - struct gfs2_hole_walker_args *args = data; - const __be64 *ptr; + const __be64 *start, *ptr, *end; + unsigned int hgt; + + hgt = mp->mp_aheight - 1; + start = metapointer(hgt, mp); + end = start + ptrs; for (ptr = start; ptr < end; ptr++) { if (*ptr) { - args->blocks += (ptr - start) * factor; + mp->mp_list[hgt] += ptr - start; if (mp->mp_aheight == mp->mp_fheight) return WALK_STOP; - return ptr; /* increase height */ + return WALK_FOLLOW; } } - args->blocks += (end - start) * factor; - return WALK_NEXT; + return WALK_CONTINUE; } /** @@ -557,12 +583,24 @@ static const __be64 *gfs2_hole_walker(struct metapath *mp, static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, struct metapath *mp, struct iomap *iomap) { - struct gfs2_hole_walker_args args = { }; - int ret = 0; + struct metapath clone; + u64 hole_size; + int ret; - ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args); - if (!ret) - iomap->length = args.blocks << inode->i_blkbits; + clone_metapath(&clone, mp); + ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker); + if (ret < 0) + goto out; + + if (ret == 1) + hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock; + else + hole_size = len; + iomap->length = hole_size << inode->i_blkbits; + ret = 0; + +out: + release_metapath(&clone); return ret; } diff --git a/fs/io_uring.c b/fs/io_uring.c index d542f1cf4428..0dadbdbead0f 
100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,7 +75,7 @@ #include "internal.h" -#define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_FIXED_FILES 1024 struct io_uring { @@ -84,27 +84,29 @@ struct io_uring { }; /* - * This data is shared with the application through the mmap at offset - * IORING_OFF_SQ_RING. + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. * * The offsets to the member fields are published through struct * io_sqring_offsets when calling io_uring_setup. */ -struct io_sq_ring { +struct io_rings { /* * Head and tail offsets into the ring; the offsets need to be * masked to get valid indices. * - * The kernel controls head and the application controls tail. + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. */ - struct io_uring r; + struct io_uring sq, cq; /* - * Bitmask to apply to head and tail offsets (constant, equals + * Bitmasks to apply to head and tail offsets (constant, equals * ring_entries - 1) */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; /* * Number of invalid entries dropped by the kernel due to * invalid index stored in array @@ -117,7 +119,7 @@ struct io_sq_ring { * counter includes all submissions that were dropped reaching * the new SQ head (and possibly more). */ - u32 dropped; + u32 sq_dropped; /* * Runtime flags * @@ -127,43 +129,7 @@ struct io_sq_ring { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 flags; - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. 
be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 array[]; -}; - -/* - * This data is shared with the application through the mmap at offset - * IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_cqring_offsets when calling io_uring_setup. - */ -struct io_cq_ring { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The application controls head and the kernel tail. - */ - struct io_uring r; - /* - * Bitmask to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -177,7 +143,7 @@ struct io_cq_ring { * As completion events come in out of order this counter is not * ordered with any other data. */ - u32 overflow; + u32 cq_overflow; /* * Ring buffer of completion events. * @@ -185,7 +151,7 @@ struct io_cq_ring { * produced, so the application is allowed to modify pending * entries. */ - struct io_uring_cqe cqes[]; + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; struct io_mapped_ubuf { @@ -201,7 +167,7 @@ struct async_list { struct list_head list; struct file *file; - off_t io_end; + off_t io_start; size_t io_len; }; @@ -215,8 +181,18 @@ struct io_ring_ctx { bool compat; bool account_mem; - /* SQ ring */ - struct io_sq_ring *sq_ring; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. 
+ * + * The kernel modifies neither the indices array nor the entries + * array. + */ + u32 *sq_array; unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; @@ -227,15 +203,13 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IO offload */ - struct workqueue_struct *sqo_wq; + struct workqueue_struct *sqo_wq[2]; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; struct completion sqo_thread_started; struct { - /* CQ ring */ - struct io_cq_ring *cq_ring; unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; @@ -244,6 +218,8 @@ struct io_ring_ctx { struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* * If used, fixed file set. Writers must ensure that ->refs is dead, * readers must ensure that ->refs is alive as long as the file* is @@ -288,6 +264,7 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; + u32 sequence; bool has_user; bool needs_lock; bool needs_fixed_file; @@ -335,6 +312,7 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ +#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ u64 user_data; u32 result; u32 sequence; @@ -366,6 +344,7 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -430,7 +409,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx, if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; + return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -451,11 +430,11 @@ static struct io_kiocb *io_get_deferred_req(struct 
io_ring_ctx *ctx) static void __io_commit_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { /* order cqe stores with ring update */ - smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); @@ -464,6 +443,24 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_queue_async_work(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + int rw; + + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + default: + rw = 0; + break; + } + + queue_work(ctx->sqo_wq[rw], &req->work); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -471,14 +468,19 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { + if (req->flags & REQ_F_SHADOW_DRAIN) { + /* Just for drain, free it. 
*/ + __io_free_req(req); + continue; + } req->flags |= REQ_F_IO_DRAINED; - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; unsigned tail; tail = ctx->cached_cq_tail; @@ -487,11 +489,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) + if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) return NULL; ctx->cached_cq_tail++; - return &ring->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & ctx->cq_mask]; } static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, @@ -510,9 +512,9 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); } } @@ -635,7 +637,7 @@ static void io_req_link_next(struct io_kiocb *req) nxt->flags |= REQ_F_LINK_DONE; INIT_WORK(&nxt->work, io_sq_wq_submit_work); - queue_work(req->ctx->sqo_wq, &nxt->work); + io_queue_async_work(req->ctx, nxt); } } @@ -679,6 +681,13 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static unsigned io_cqring_events(struct io_rings *rings) +{ + /* See comment at the top of this file */ + smp_rmb(); + return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); +} + /* * Find and free completed poll iocbs */ @@ -771,7 +780,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list)) 
{ + while (!list_empty(&ctx->poll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); @@ -798,6 +807,12 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) unsigned int nr_events = 0; io_iopoll_getevents(ctx, &nr_events, 1); + + /* + * Ensure we allow local-to-the-cpu processing to take place, + * in this case we need to ensure that we reap all events. + */ + cond_resched(); } mutex_unlock(&ctx->uring_lock); } @@ -805,11 +820,42 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, long min) { - int ret = 0; + int iters, ret = 0; + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + + iters = 0; do { int tmin = 0; + /* + * Don't enter poll loop if we already have events pending. + * If we do, we can potentially be spinning for commands that + * already triggered a CQE (eg in error). + */ + if (io_cqring_events(ctx->rings)) + break; + + /* + * If a submit got punted to a workqueue, we can have the + * application entering polling for a command before it gets + * issued. That app will hold the uring_lock for the duration + * of the poll right here, so we need to take a breather every + * now and then to ensure that the issue has a chance to add + * the poll to the issued list. Otherwise we can spin here + * forever, while the workqueue is stuck trying to acquire the + * very same mutex. 
+ */ + if (!(++iters & 7)) { + mutex_unlock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + if (*nr_events < min) tmin = min - *nr_events; @@ -819,6 +865,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + mutex_unlock(&ctx->uring_lock); return ret; } @@ -1097,10 +1144,8 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; - iter->count -= (seg_skip << PAGE_SHIFT); + iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ~PAGE_MASK; - if (iter->iov_offset) - iter->count -= iter->iov_offset; } } @@ -1144,6 +1189,28 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) +{ + if (al->file == kiocb->ki_filp) { + off_t start, end; + + /* + * Allow merging if we're anywhere in the range of the same + * page. Generally this happens for sub-page reads or writes, + * and it's beneficial to allow the first worker to bring the + * page in and the piggy backed work can then work on the + * cached page. + */ + start = al->io_start & PAGE_MASK; + end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; + if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) + return true; + } + + al->file = NULL; + return false; +} + /* * Make a note of the last file/offset/direction we punted to async * context. 
We'll use this information to see if we can piggy back a @@ -1155,9 +1222,8 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) struct async_list *async_list = &req->ctx->pending_async[rw]; struct kiocb *kiocb = &req->rw; struct file *filp = kiocb->ki_filp; - off_t io_end = kiocb->ki_pos + len; - if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { + if (io_should_merge(async_list, kiocb)) { unsigned long max_bytes; /* Use 8x RA size as a decent limiter for both reads/writes */ @@ -1170,17 +1236,16 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) req->flags |= REQ_F_SEQ_PREV; async_list->io_len += len; } else { - io_end = 0; - async_list->io_len = 0; + async_list->file = NULL; } } /* New file? Reset state. */ if (async_list->file != filp) { - async_list->io_len = 0; + async_list->io_start = kiocb->ki_pos; + async_list->io_len = len; async_list->file = filp; } - async_list->io_end = io_end; } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, @@ -1492,7 +1557,7 @@ static void io_poll_remove_one(struct io_kiocb *req) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - queue_work(req->ctx->sqo_wq, &req->work); + io_queue_async_work(req->ctx, req); } spin_unlock(&poll->head->lock); @@ -1606,7 +1671,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, io_cqring_ev_posted(ctx); io_put_req(req); } else { - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } return 1; @@ -1949,7 +2014,7 @@ out: */ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) { - bool ret = false; + bool ret; if (!list) return false; @@ -1995,10 +2060,14 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); - if (flags & IOSQE_IO_DRAIN) { + if (flags & IOSQE_IO_DRAIN) req->flags |= 
REQ_F_IO_DRAIN; - req->sequence = ctx->cached_sq_head - 1; - } + /* + * All io need record the previous position, if LINK vs DRAIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = s->sequence; if (!io_op_needs_file(s->sqe)) return 0; @@ -2020,12 +2089,12 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s) +static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) { int ret; - ret = __io_submit_sqe(ctx, req, s, true); + ret = __io_submit_sqe(ctx, req, s, force_nonblock); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; @@ -2042,7 +2111,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (list) atomic_inc(&list->cnt); INIT_WORK(&req->work, io_sq_wq_submit_work); - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } /* @@ -2067,10 +2136,70 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; } +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, bool force_nonblock) +{ + int ret; + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + } + return 0; + } + + return __io_queue_sqe(ctx, req, s, force_nonblock); +} + +static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, struct io_kiocb *shadow, + bool force_nonblock) +{ + int ret; + int need_submit = false; + + if (!shadow) + return io_queue_sqe(ctx, req, s, force_nonblock); + + /* + * Mark the first IO in link list as DRAIN, let all the following + * IOs enter the defer list. all IO needs to be completed before link + * list.
+ */ + req->flags |= REQ_F_IO_DRAIN; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return 0; + } + } else { + /* + * If ret == 0 means that all IOs in front of link io are + * running done. let's queue link head. + */ + need_submit = true; + } + + /* Insert shadow req to defer_list, blocking next IOs */ + spin_lock_irq(&ctx->completion_lock); + list_add_tail(&shadow->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + + if (need_submit) + return __io_queue_sqe(ctx, req, s, force_nonblock); + + return 0; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link) + struct io_submit_state *state, struct io_kiocb **link, + bool force_nonblock) { struct io_uring_sqe *sqe_copy; struct io_kiocb *req; @@ -2097,13 +2226,6 @@ err: return; } - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) - goto err_req; - return; - } - /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -2130,7 +2252,7 @@ err: INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s); + io_queue_sqe(ctx, req, s, force_nonblock); } } @@ -2160,15 +2282,15 @@ static void io_submit_state_start(struct io_submit_state *state, static void io_commit_sqring(struct io_ring_ctx *ctx) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. 
*/ - smp_store_release(&ring->r.head, ctx->cached_sq_head); + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } } @@ -2182,7 +2304,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; + u32 *sq_array = ctx->sq_array; unsigned head; /* @@ -2195,20 +2318,21 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&ring->r.tail)) + if (head == smp_load_acquire(&rings->sq.tail)) return false; - head = READ_ONCE(ring->array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { s->index = head; s->sqe = &ctx->sq_sqes[head]; + s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; return true; } /* drop invalid entries */ ctx->cached_sq_head++; - ring->dropped++; + rings->sq_dropped++; return false; } @@ -2217,6 +2341,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submitted = 0; @@ -2231,11 +2356,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, * that's the end of the chain. Submit the previous link. 
*/ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + true); link = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = sqes[i].sequence; + } + if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2243,13 +2378,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, sqes[i].has_user = has_user; sqes[i].needs_lock = true; sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link); + io_submit_sqe(ctx, &sqes[i], statep, &link, true); submitted++; } } if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, true); if (statep) io_submit_state_end(&state); @@ -2280,15 +2415,7 @@ static int io_sq_thread(void *data) unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - /* - * We disallow the app entering submit/complete - * with polling, but we still need to lock the - * ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); io_iopoll_check(ctx, &nr_events, 0); - mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. 
@@ -2329,7 +2456,7 @@ static int io_sq_thread(void *data) TASK_INTERRUPTIBLE); /* Tell userspace we may need a wakeup call */ - ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ smp_mb(); @@ -2343,12 +2470,12 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; continue; } finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } i = 0; @@ -2389,10 +2516,12 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, + bool block_for_last) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submit = 0; @@ -2402,6 +2531,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) } for (i = 0; i < to_submit; i++) { + bool force_nonblock = true; struct sqe_submit s; if (!io_get_sqring(ctx, &s)) @@ -2412,34 +2542,49 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) * that's the end of the chain. Submit the previous link. 
*/ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + force_nonblock); link = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = s.sequence; + } + s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - io_submit_sqe(ctx, &s, statep, &link); + + /* + * The caller will block for events after submit, submit the + * last IO non-blocking. This is either the only IO it's + * submitting, or it already submitted the previous ones. This + * improves performance by avoiding an async punt that we don't + * need to do. + */ + if (block_for_last && submit == to_submit) + force_nonblock = false; + + io_submit_sqe(ctx, &s, statep, &link, force_nonblock); } io_commit_sqring(ctx); if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req, + block_for_last); if (statep) io_submit_state_end(statep); return submit; } -static unsigned io_cqring_events(struct io_cq_ring *ring) -{ - /* See comment at the top of this file */ - smp_rmb(); - return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); -} - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. 
@@ -2447,10 +2592,10 @@ static unsigned io_cqring_events(struct io_cq_ring *ring) static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; int ret; - if (io_cqring_events(ring) >= min_events) + if (io_cqring_events(rings) >= min_events) return 0; if (sig) { @@ -2466,12 +2611,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); + ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; - return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -2521,11 +2666,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { + int i; + io_sq_thread_stop(ctx); - if (ctx->sqo_wq) { - destroy_workqueue(ctx->sqo_wq); - ctx->sqo_wq = NULL; + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { + if (ctx->sqo_wq[i]) { + destroy_workqueue(ctx->sqo_wq[i]); + ctx->sqo_wq[i] = NULL; + } } } @@ -2733,16 +2882,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, } /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", + WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq) { + if (!ctx->sqo_wq[0]) { + ret = -ENOMEM; + goto err; + } + + /* + * This is for buffered writes, where we want to limit the parallelism + * due to file locking in file systems. As "normal" buffered writes + * should parallelize on writeout quite nicely, limit us to having 2 + * pending.
This avoids massive contention on the inode when doing + * buffered async writes. + */ + ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", + WQ_UNBOUND | WQ_FREEZABLE, 2); + if (!ctx->sqo_wq[1]) { ret = -ENOMEM; goto err; } return 0; err: - io_sq_thread_stop(ctx); + io_finish_async(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -2791,17 +2955,45 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp_flags, get_order(size)); } +static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, + size_t *sq_offset) +{ + struct io_rings *rings; + size_t off, sq_array_size; + + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif + + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; + + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; + + if (sq_offset) + *sq_offset = off; + + return off; +} + static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t bytes; + size_t pages; - bytes = struct_size(sq_ring, array, sq_entries); - bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); - bytes += struct_size(cq_ring, cqes, cq_entries); + pages = (size_t)1 << get_order( + rings_size(sq_entries, cq_entries, NULL)); + pages += (size_t)1 << get_order( + array_size(sizeof(struct io_uring_sqe), sq_entries)); - return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + return pages; } static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) @@ -2815,7 +3007,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_page(imu->bvec[j].bv_page); + put_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ 
-2959,10 +3151,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * if we did partial map, or found file backed vmas, * release any pages we did get */ - if (pret > 0) { - for (j = 0; j < pret; j++) - put_page(pages[j]); - } + if (pret > 0) + put_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -3048,9 +3238,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif - io_mem_free(ctx->sq_ring); + io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); - io_mem_free(ctx->cq_ring); percpu_ref_exit(&ctx->refs); if (ctx->account_mem) @@ -3071,10 +3260,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != - ctx->sq_ring->ring_entries) + if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != + ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3119,14 +3308,12 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset) { case IORING_OFF_SQ_RING: - ptr = ctx->sq_ring; + case IORING_OFF_CQ_RING: + ptr = ctx->rings; break; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; - case IORING_OFF_CQ_RING: - ptr = ctx->cq_ring; - break; default: return -EINVAL; } @@ -3169,19 +3356,27 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * Just return the requested submit count, and wake the thread if * we were asked to. 
*/ + ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; - goto out_ctx; - } + } else if (to_submit) { + bool block_for_last = false; - ret = 0; - if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); + /* + * Allow last submission to block in a series, IFF the caller + * asked to wait for events and we don't currently have + * enough. This potentially avoids an async punt. + */ + if (to_submit == min_complete && + io_cqring_events(ctx->rings) < min_complete) + block_for_last = true; + mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit); + submitted = io_ring_submit(ctx, to_submit, block_for_last); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3190,15 +3385,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, min_complete = min(min_complete, ctx->cq_entries); if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); - mutex_unlock(&ctx->uring_lock); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } } -out_ctx: io_ring_drop_ctx_refs(ctx, 1); out_fput: fdput(f); @@ -3215,19 +3407,27 @@ static const struct file_operations io_uring_fops = { static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t size; + struct io_rings *rings; + size_t size, sq_array_offset; - sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); - if (!sq_ring) + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + rings = io_mem_alloc(size); + if (!rings) return -ENOMEM; - ctx->sq_ring = sq_ring; - sq_ring->ring_mask = p->sq_entries - 1; - sq_ring->ring_entries = p->sq_entries; - ctx->sq_mask = sq_ring->ring_mask; - ctx->sq_entries = sq_ring->ring_entries; + ctx->rings = rings; + 
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + rings->sq_ring_mask = p->sq_entries - 1; + rings->cq_ring_mask = p->cq_entries - 1; + rings->sq_ring_entries = p->sq_entries; + rings->cq_ring_entries = p->cq_entries; + ctx->sq_mask = rings->sq_ring_mask; + ctx->cq_mask = rings->cq_ring_mask; + ctx->sq_entries = rings->sq_ring_entries; + ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) @@ -3237,15 +3437,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (!ctx->sq_sqes) return -ENOMEM; - cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) - return -ENOMEM; - - ctx->cq_ring = cq_ring; - cq_ring->ring_mask = p->cq_entries - 1; - cq_ring->ring_entries = p->cq_entries; - ctx->cq_mask = cq_ring->ring_mask; - ctx->cq_entries = cq_ring->ring_entries; return 0; } @@ -3349,21 +3540,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_sq_ring, r.head); - p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); - p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); - p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); - p->sq_off.flags = offsetof(struct io_sq_ring, flags); - p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); - p->sq_off.array = offsetof(struct io_sq_ring, array); + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_cq_ring, r.head); - 
p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); - p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); - p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); - p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); - p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: io_ring_ctx_wait_and_kill(ctx); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 85a9093769a9..35768a63fb1d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -10,7 +10,7 @@ * * The following files are helpful: * - * Documentation/filesystems/nfs/Exporting + * Documentation/filesystems/nfs/exporting.rst * fs/exportfs/expfs.c. */ diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 22a273bd4648..05cb0e8e4382 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -5,7 +5,7 @@ config JFS_FS select CRC32 help This is a port of IBM's Journaled Filesystem . More information is - available in the file <file:Documentation/filesystems/jfs.txt>. + available in the file <file:Documentation/admin-guide/jfs.rst>. If you do not intend to use the JFS filesystem, say N. 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 0ff3facf81da..071b90a45933 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -153,7 +153,7 @@ again: /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - err = nfs4_open_delegation_recall(ctx, state, stateid, type); + err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) err = nfs_delegation_claim_locks(state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -1046,6 +1046,22 @@ void nfs_test_expired_all_delegations(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } +static void +nfs_delegation_test_free_expired(struct inode *inode, + nfs4_stateid *stateid, + const struct cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + int status; + + if (!cred) + return; + status = ops->test_and_free_expired(server, stateid, cred); + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_remove_bad_delegation(inode, stateid); +} + /** * nfs_reap_expired_delegations - reap expired delegations * @clp: nfs_client to process @@ -1057,7 +1073,6 @@ void nfs_test_expired_all_delegations(struct nfs_client *clp) */ void nfs_reap_expired_delegations(struct nfs_client *clp) { - const struct nfs4_minor_version_ops *ops = clp->cl_mvops; struct nfs_delegation *delegation; struct nfs_server *server; struct inode *inode; @@ -1088,11 +1103,7 @@ restart: nfs4_stateid_copy(&stateid, &delegation->stateid); clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); rcu_read_unlock(); - if (cred != NULL && - ops->test_and_free_expired(server, &stateid, cred) < 0) { - nfs_revoke_delegation(inode, &stateid); - nfs_inode_find_state_and_recover(inode, &stateid); - } + nfs_delegation_test_free_expired(inode, &stateid, cred); put_cred(cred); if (nfs4_server_rebooted(clp)) { 
nfs_inode_mark_test_expired_delegation(server,inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 5799777df5ec..9eb87ae4c982 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -63,7 +63,7 @@ void nfs_reap_expired_delegations(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync); -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred); bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8d501093660f..0adfd8840110 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1487,7 +1487,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx, if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) nfs_file_set_open_context(file, ctx); else - err = -ESTALE; + err = -EOPENSTALE; out: return err; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0cb442406168..222d7115db71 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -401,15 +401,21 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) unsigned long bytes = 0; struct nfs_direct_req *dreq = hdr->dreq; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; - spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - else + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; + } + + if (hdr->good_bytes != 0) 
nfs_direct_good_bytes(dreq, hdr); + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + dreq->error = 0; + spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -782,16 +788,19 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; - nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - if (dreq->error == 0) { + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; + } + + if (hdr->good_bytes != 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b04e20d28162..5657b7f2611f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -8,6 +8,7 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/module.h> #include <linux/sched/mm.h> @@ -928,7 +929,9 @@ retry: pgm = &pgio->pg_mirrors[0]; pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - pgio->pg_maxretrans = io_maxretrans; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; return; out_nolseg: if (pgio->pg_error < 0) @@ -940,6 +943,7 @@ out_mds: pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; + pgio->pg_maxretrans = 0; nfs_pageio_reset_read_mds(pgio); } @@ -1000,7 +1004,9 @@ retry: pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } - pgio->pg_maxretrans = io_maxretrans; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; return; out_mds: @@ -1010,6 +1016,7 @@ out_mds: pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); 
pgio->pg_lseg = NULL; + pgio->pg_maxretrans = 0; nfs_pageio_reset_write_mds(pgio); } @@ -1148,8 +1155,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; - case -EAGAIN: - return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1210,7 +1215,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: - case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1445,16 +1449,6 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } -static void -ff_layout_io_prepare_transmit(struct rpc_task *task, - void *data) -{ - struct nfs_pgio_header *hdr = data; - - if (!pnfs_is_valid_lseg(hdr->lseg)) - rpc_exit(task, -EAGAIN); -} - static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1740,7 +1734,6 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1748,7 +1741,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1756,7 +1748,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = 
ff_layout_write_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1764,7 +1755,6 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 53507aa96b0b..3800ab6f08fa 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -114,6 +114,10 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int struct rb_node **p, *parent; int diff; + nfss->fscache_key = NULL; + nfss->fscache = NULL; + if (!(nfss->options & NFS_OPTION_FSCACHE)) + return; if (!uniq) { uniq = ""; ulen = 1; @@ -226,10 +230,11 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) void nfs_fscache_init_inode(struct inode *inode) { struct nfs_fscache_inode_auxdata auxdata; + struct nfs_server *nfss = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); nfsi->fscache = NULL; - if (!S_ISREG(inode->i_mode)) + if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; memset(&auxdata, 0, sizeof(auxdata)); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 25a75e40d91d..ad041cfbf9ec 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -182,7 +182,7 @@ static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) */ static inline const char *nfs_server_fscache_state(struct nfs_server *server) { - if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + if (server->fscache) return "yes"; return "no "; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8a1758200b57..2a03bfeec10a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c 
@@ -1403,12 +1403,22 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; /* Has the inode gone and changed behind our back? */ - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + } else if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; return -ESTALE; + } if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -ESTALE; + if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) @@ -1768,18 +1778,6 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); -static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, - struct nfs_fattr *fattr) -{ - bool ret1 = true, ret2 = true; - - if (fattr->valid & NFS_ATTR_FATTR_FILEID) - ret1 = (nfsi->fileid == fattr->fileid); - if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) - ret2 = (nfsi->fileid == fattr->mounted_on_fileid); - return ret1 || ret2; -} - /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1810,7 +1808,16 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if (!nfs_fileid_valid(nfsi, fattr)) { + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? 
Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; + /* Has the inode gone and changed behind our back? */ + } else if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a2346a2f8361..e64f810223be 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -775,3 +775,13 @@ static inline bool nfs_error_is_fatal(int err) } } +static inline bool nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); +} diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index d778dad9a75e..3564da1ba8a1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -465,7 +465,8 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t); extern void nfs4_put_state_owner(struct nfs4_state_owner *); -extern void nfs4_purge_state_owners(struct nfs_server *); +extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *); +extern void nfs4_free_state_owners(struct list_head *head); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 616393a01c06..da6204025a2d 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -758,9 +758,12 @@ out: static void nfs4_destroy_server(struct nfs_server *server) { + LIST_HEAD(freeme); + nfs_server_return_all_delegations(server); 
unset_pnfs_layoutdriver(server); - nfs4_purge_state_owners(server); + nfs4_purge_state_owners(server, &freeme); + nfs4_free_state_owners(&freeme); } /* diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 96db471ca2e5..339663d04bf8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -73,13 +73,13 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - goto out_put_ctx; default: + goto out_put_ctx; + case -ENOENT: + case -ESTALE: + case -EISDIR: + case -ENOTDIR: + case -ELOOP: goto out_drop; } } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 39896afc6edf..1406858bae6c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1683,6 +1683,14 @@ static void nfs_state_set_open_stateid(struct nfs4_state *state, write_sequnlock(&state->seqlock); } +static void nfs_state_clear_open_state_flags(struct nfs4_state *state) +{ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); +} + static void nfs_state_set_delegation(struct nfs4_state *state, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -1907,8 +1915,9 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update: - update_open_stateid(state, &data->o_res.stateid, NULL, - data->o_arg.fmode); + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, data->o_arg.fmode)) + return ERR_PTR(-EAGAIN); refcount_inc(&state->count); return state; @@ -1973,8 +1982,11 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); - update_open_stateid(state, &data->o_res.stateid, NULL, - data->o_arg.fmode); + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, 
data->o_arg.fmode)) { + nfs4_put_open_state(state); + state = ERR_PTR(-EAGAIN); + } out: nfs_release_seqid(data->o_arg.seqid); return state; @@ -2074,13 +2086,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * { int ret; - /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ - clear_bit(NFS_O_RDWR_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ - clear_bit(NFS_DELEGATED_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); if (ret != 0) @@ -2156,6 +2162,8 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return -EAGAIN; + clear_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_state_clear_open_state_flags(state); ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; @@ -2171,18 +2179,17 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -ENOENT: case -EAGAIN: case -ESTALE: + case -ETIMEDOUT: break; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); return -EAGAIN; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -2203,7 +2210,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - set_bit(NFS_DELEGATED_STATE, &state->flags); ssleep(1); return -EAGAIN; case -ENOMEM: @@ -2219,8 +2225,7 @@ static 
int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct } int nfs4_open_delegation_recall(struct nfs_open_context *ctx, - struct nfs4_state *state, const nfs4_stateid *stateid, - fmode_t type) + struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_opendata *opendata; @@ -2231,20 +2236,23 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, if (IS_ERR(opendata)) return PTR_ERR(opendata); nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - nfs_state_clear_delegation(state); - switch (type & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ|FMODE_WRITE: - case FMODE_WRITE: + if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); if (err) - break; + goto out; + } + if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_WRITE); if (err) - break; - /* Fall through */ - case FMODE_READ: + goto out; + } + if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_READ); + if (err) + goto out; } + nfs_state_clear_delegation(state); +out: nfs4_opendata_put(opendata); return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } @@ -2492,6 +2500,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, if (!ctx) { nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1); data->is_recover = true; + task_setup_data.flags |= RPC_TASK_TIMEOUT; } else { nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0); pnfs_lgopen_prepare(data, ctx); @@ -2698,6 +2707,7 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st { /* NFSv4.0 doesn't allow for delegation recovery on open expire */ nfs40_clear_delegation_stateid(state); + nfs_state_clear_open_state_flags(state); return nfs4_open_expired(sp, state); } @@ -2740,13 +2750,13 @@ out_free: return -NFS4ERR_EXPIRED; } -static void 
nfs41_check_delegation_stateid(struct nfs4_state *state) +static int nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid stateid; struct nfs_delegation *delegation; const struct cred *cred = NULL; - int status; + int status, ret = NFS_OK; /* Get the delegation credential for use by test/free_stateid */ rcu_read_lock(); @@ -2754,20 +2764,15 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) if (delegation == NULL) { rcu_read_unlock(); nfs_state_clear_delegation(state); - return; + return NFS_OK; } nfs4_stateid_copy(&stateid, &delegation->stateid); - if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { - rcu_read_unlock(); - nfs_state_clear_delegation(state); - return; - } if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { rcu_read_unlock(); - return; + return NFS_OK; } if (delegation->cred) @@ -2777,9 +2782,24 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) trace_nfs4_test_delegation_stateid(state, NULL, status); if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) nfs_finish_clear_delegation_stateid(state, &stateid); + else + ret = status; - if (delegation->cred) - put_cred(cred); + put_cred(cred); + return ret; +} + +static void nfs41_delegation_recover_stateid(struct nfs4_state *state) +{ + nfs4_stateid tmp; + + if (test_bit(NFS_DELEGATED_STATE, &state->flags) && + nfs4_copy_delegation_stateid(state->inode, state->state, + &tmp, NULL) && + nfs4_stateid_match_other(&state->stateid, &tmp)) + nfs_state_set_delegation(state, &tmp, state->state); + else + nfs_state_clear_delegation(state); } /** @@ -2849,21 +2869,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) const struct cred *cred = state->owner->so_cred; int status; - if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { - if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { - if (nfs4_have_delegation(state->inode, state->state)) - 
return NFS_OK; - return -NFS4ERR_OPENMODE; - } + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) return -NFS4ERR_BAD_STATEID; - } status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); + nfs_state_clear_open_state_flags(state); stateid->type = NFS4_INVALID_STATEID_TYPE; return status; } @@ -2876,7 +2887,11 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st { int status; - nfs41_check_delegation_stateid(state); + status = nfs41_check_delegation_stateid(state); + if (status != NFS_OK) + return status; + nfs41_delegation_recover_stateid(state); + status = nfs41_check_expired_locks(state); if (status != NFS_OK) return status; @@ -3201,7 +3216,7 @@ static int _nfs4_do_setattr(struct inode *inode, if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (ctx != NULL) { + } else if (ctx != NULL && ctx->state) { struct nfs_lock_context *l_ctx; if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9afd051a4876..cad4e064b328 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -624,24 +624,39 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) /** * nfs4_purge_state_owners - Release all cached state owners * @server: nfs_server with cached state owners to release + * @head: resulting list of state owners * * Called at umount time. Remaining state owners will be on * the LRU with ref count of zero. + * Note that the state owners are not freed, but are added + * to the list @head, which can later be used as an argument + * to nfs4_free_state_owners. 
*/ -void nfs4_purge_state_owners(struct nfs_server *server) +void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *tmp; - LIST_HEAD(doomed); spin_lock(&clp->cl_lock); list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { - list_move(&sp->so_lru, &doomed); + list_move(&sp->so_lru, head); nfs4_remove_state_owner_locked(sp); } spin_unlock(&clp->cl_lock); +} - list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { +/** + * nfs4_free_state_owners - Free a list of purged state owners + * @head: list of state owners to free + * + * Frees a list of state owners that was generated by + * nfs4_purge_state_owners + */ +void nfs4_free_state_owners(struct list_head *head) +{ + struct nfs4_state_owner *sp, *tmp; + + list_for_each_entry_safe(sp, tmp, head, so_lru) { list_del(&sp->so_lru); nfs4_free_state_owner(sp); } @@ -1463,7 +1478,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_schedule_state_manager(clp); } -static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state, int err) { struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -1474,6 +1489,8 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) if (ctx->state != state) continue; set_bit(NFS_CONTEXT_BAD, &ctx->flags); + pr_warn("NFSv4: state recovery failed for open file %pd2, " + "error = %d\n", ctx->dentry, err); } rcu_read_unlock(); } @@ -1481,7 +1498,7 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) { set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); - nfs4_state_mark_open_context_bad(state); + nfs4_state_mark_open_context_bad(state, error); } @@ -1512,6 +1529,7 @@ restart: switch (status) { case 0: break; + case -ETIMEDOUT: case -ESTALE: case 
-NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: @@ -1605,6 +1623,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; + unsigned int loop = 0; int status = 0; /* Note: we rely on the sp->so_states list being ordered @@ -1631,8 +1650,10 @@ restart: switch (status) { default: - if (status >= 0) + if (status >= 0) { + loop = 0; break; + } printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); /* Fall through */ case -ENOENT: @@ -1646,6 +1667,10 @@ restart: break; case -EAGAIN: ssleep(1); + if (loop++ < 10) { + set_bit(ops->state_flag_bit, &state->flags); + break; + } /* Fall through */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: @@ -1658,11 +1683,13 @@ restart: case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + /* Fall through */ case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -ETIMEDOUT: goto out_err; } nfs4_put_open_state(state); @@ -1856,12 +1883,13 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct nfs4_state_owner *sp; struct nfs_server *server; struct rb_node *pos; + LIST_HEAD(freeme); int status = 0; restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - nfs4_purge_state_owners(server); + nfs4_purge_state_owners(server, &freeme); spin_lock(&clp->cl_lock); for (pos = rb_first(&server->state_owners); pos != NULL; @@ -1890,6 +1918,7 @@ restart: spin_unlock(&clp->cl_lock); } rcu_read_unlock(); + nfs4_free_state_owners(&freeme); return 0; } @@ -1945,7 +1974,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) return -EPERM; case -EACCES: case -NFS4ERR_DELAY: - case -ETIMEDOUT: case 
-EAGAIN: ssleep(1); break; @@ -2574,7 +2602,7 @@ static void nfs4_state_manager(struct nfs_client *clp) } /* Now recover expired state... */ - if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; status = nfs4_do_reclaim(clp, clp->cl_mvops->nograce_recovery_ops); @@ -2582,6 +2610,7 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; if (status < 0) goto out_error; + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } nfs4_end_drain_session(clp); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ed4e1b07447b..20b3717cd7ca 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, } hdr->res.fattr = &hdr->fattr; - hdr->res.count = count; + hdr->res.count = 0; hdr->res.eof = 0; hdr->res.verf = &hdr->verf; nfs_fattr_init(&hdr->fattr); @@ -1251,20 +1251,23 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - LIST_HEAD(failed); + LIST_HEAD(pages); desc->pg_io_completion = hdr->io_completion; desc->pg_dreq = hdr->dreq; - while (!list_empty(&hdr->pages)) { - struct nfs_page *req = nfs_list_entry(hdr->pages.next); + list_splice_init(&hdr->pages, &pages); + while (!list_empty(&pages)) { + struct nfs_page *req = nfs_list_entry(pages.next); if (!nfs_pageio_add_request(desc, req)) - nfs_list_move_request(req, &failed); + break; } nfs_pageio_complete(desc); - if (!list_empty(&failed)) { - list_move(&failed, &hdr->pages); - return desc->pg_error < 0 ? desc->pg_error : -EIO; + if (!list_empty(&pages)) { + int err = desc->pg_error < 0 ? 
desc->pg_error : -EIO; + hdr->completion_ops->error_cleanup(&pages, err); + nfs_set_pgio_error(hdr, err, hdr->io_start); + return err; } return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 75bd5b552ba4..4525d5acae38 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1903,12 +1903,6 @@ lookup_again: goto out_unlock; } - if (!nfs4_valid_open_stateid(ctx->state)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_INVALID_OPEN); - goto out_unlock; - } - /* * Choose a stateid for the LAYOUTGET. If we don't have a layout * stateid, or it has been invalidated, then we must use the open @@ -1939,6 +1933,7 @@ lookup_again: iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ, NULL, &stateid, NULL); if (status != 0) { + lseg = ERR_PTR(status); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_INVALID_OPEN); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index c0046c348910..82af4809b869 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -627,11 +627,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); - } else - clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans); + continue; + } + clp = get_v3_ds_connect(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans); + if (IS_ERR(clp)) + continue; + clp->cl_rpcclient->cl_softerr = 0; + clp->cl_rpcclient->cl_softrtry = 0; } if (IS_ERR(clp)) { diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5552fa8b6e12..0f7288b94633 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -594,7 +594,8 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (hdr->args.offset + hdr->res.count >= 
hdr->res.fattr->size) + if ((hdr->res.count == 0 && hdr->args.count > 0) || + hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) hdr->res.eof = 1; } return 0; @@ -615,8 +616,10 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + hdr->res.count = hdr->args.count; nfs_writeback_update_inode(hdr); + } return 0; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c19841c82b6a..cfe0b586eadd 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,19 +91,25 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -static void nfs_readpage_release(struct nfs_page *req) +static void nfs_readpage_release(struct nfs_page *req, int error) { struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); + struct page *page = req->wb_page; dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, (long long)req_offset(req)); + if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) + SetPageError(page); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); + struct address_space *mapping = page_file_mapping(page); - unlock_page(req->wb_page); + if (PageUptodate(page)) + nfs_readpage_to_fscache(inode, page, 0); + else if (!PageError(page) && !PagePrivate(page)) + generic_error_remove_page(mapping, page); + unlock_page(page); } nfs_release_request(req); } @@ -131,7 +137,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, &nfs_async_read_completion_ops); if (!nfs_pageio_add_request(&pgio, new)) { nfs_list_remove_request(new); - nfs_readpage_release(new); + nfs_readpage_release(new, pgio.pg_error); } nfs_pageio_complete(&pgio); @@ -153,6 +159,7 @@ static void nfs_page_group_set_uptodate(struct 
nfs_page *req) static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; + int error; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -179,14 +186,19 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) zero_user_segment(page, start, end); } } + error = 0; bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) nfs_page_group_set_uptodate(req); + else { + error = hdr->error; + xchg(&nfs_req_openctx(req)->error, error); + } } else nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } out: hdr->release(hdr); @@ -213,7 +225,7 @@ nfs_async_read_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } } @@ -337,8 +349,13 @@ int nfs_readpage(struct file *file, struct page *page) goto out; } + xchg(&ctx->error, 0); error = nfs_readpage_async(ctx, inode, page); - + if (!error) { + error = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !error) + error = xchg(&ctx->error, 0); + } out: put_nfs_open_context(ctx); return error; @@ -372,8 +389,8 @@ readpage_async_filler(void *data, struct page *page) zero_user_segment(page, len, PAGE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { nfs_list_remove_request(new); - nfs_readpage_release(new); error = desc->pgio->pg_error; + nfs_readpage_release(new, error); goto out; } return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 628631e2e34f..703f595dce90 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2260,6 +2260,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + (data->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) || 
data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 92d9cadc6102..85ca49549b39 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -57,6 +57,7 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_inode_remove_request(struct nfs_page *req); static void nfs_clear_request_commit(struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); @@ -591,23 +592,13 @@ release_request: static void nfs_write_error(struct nfs_page *req, int error) { + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_mapping_set_error(req->wb_page, error); + nfs_inode_remove_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } -static bool -nfs_error_is_fatal_on_server(int err) -{ - switch (err) { - case 0: - case -ERESTARTSYS: - case -EINTR: - return false; - } - return nfs_error_is_fatal(err); -} - /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). 
@@ -615,7 +606,6 @@ nfs_error_is_fatal_on_server(int err) static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct page *page) { - struct address_space *mapping; struct nfs_page *req; int ret = 0; @@ -630,12 +620,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ - ret = 0; - mapping = page_file_mapping(page); - if (test_bit(AS_ENOSPC, &mapping->flags) || - test_bit(AS_EIO, &mapping->flags)) + ret = pgio->pg_error; + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -647,6 +636,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, } else ret = -EAGAIN; nfs_redirty_request(req); + pgio->pg_error = 0; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -666,7 +656,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, ret = nfs_page_async_flush(pgio, page); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); - ret = 0; + ret = AOP_WRITEPAGE_ACTIVATE; } return ret; } @@ -685,10 +675,11 @@ static int nfs_writepage_locked(struct page *page, nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); + pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err < 0) return err; - if (pgio.pg_error < 0) + if (nfs_error_is_fatal(pgio.pg_error)) return pgio.pg_error; return 0; } @@ -698,7 +689,8 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) int ret; ret = nfs_writepage_locked(page, wbc); - unlock_page(page); + if (ret != AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); return ret; } @@ -707,7 +699,8 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int ret; ret = nfs_do_writepage(page, wbc, data); - unlock_page(page); + if (ret != 
AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); return ret; } @@ -733,13 +726,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); + pgio.pg_error = 0; nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); if (err < 0) goto out_err; err = pgio.pg_error; - if (err < 0) + if (nfs_error_is_fatal(err)) goto out_err; return 0; out_err: diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 26ad75ae2be0..96352ab7bd81 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -571,7 +571,7 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) */ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - struct nfsd_net *nn = v; + struct nfsd_net *nn = m->private; seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 13c548733860..3cf4f6aa48d6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1171,13 +1171,17 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) return inode; } -static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) { struct inode *inode; inode = nfsd_get_inode(dir->i_sb, mode); if (!inode) return -ENOMEM; + if (ncl) { + inode->i_private = ncl; + kref_get(&ncl->cl_ref); + } d_add(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -1194,17 +1198,14 @@ static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *nc dentry = d_alloc_name(parent, name); if (!dentry) goto out_err; - ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600); + ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); if (ret) goto out_err; - if (ncl) { - d_inode(dentry)->i_private = ncl; - 
kref_get(&ncl->cl_ref); - } out: inode_unlock(dir); return dentry; out_err: + dput(dentry); dentry = ERR_PTR(ret); goto out; } @@ -1214,11 +1215,9 @@ static void clear_ncl(struct inode *inode) struct nfsdfs_client *ncl = inode->i_private; inode->i_private = NULL; - synchronize_rcu(); kref_put(&ncl->cl_ref, ncl->cl_release); } - static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc = inode->i_private; @@ -1232,9 +1231,9 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; - rcu_read_lock(); + inode_lock_shared(inode); nc = __get_nfsdfs_client(inode); - rcu_read_unlock(); + inode_unlock_shared(inode); return nc; } /* from __rpc_unlink */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 960f9a3c012d..a5612abc0936 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -555,7 +555,7 @@ static int orangefs_fsync(struct file *file, * Change the file pointer position for an instance of an open file. * * \note If .llseek is overriden, we must acquire lock as described in - * Documentation/filesystems/Locking. + * Documentation/filesystems/locking.rst. 
* * Future upgrade could support SEEK_DATA and SEEK_HOLE but would * require much changes to the FS diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 572dd29fbd54..34a6c99fa29b 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -246,7 +246,7 @@ struct orangefs_read_options { extern struct orangefs_stats orangefs_stats; /* - * NOTE: See Documentation/filesystems/porting for information + * NOTE: See Documentation/filesystems/porting.rst for information * on implementing FOO_I and properly accessing fs private data */ static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode) diff --git a/fs/read_write.c b/fs/read_write.c index 1f5088dec566..5bbf587f5bc1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1811,10 +1811,7 @@ static int generic_remap_check_len(struct inode *inode_in, return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ +/* Read a page's worth of file data into the page cache. */ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) { struct page *page; @@ -1826,11 +1823,33 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) put_page(page); return ERR_PTR(-EIO); } - lock_page(page); return page; } /* + * Lock two pages, ensuring that we lock in offset order if the pages are from + * the same file. + */ +static void vfs_lock_two_pages(struct page *page1, struct page *page2) +{ + /* Always lock in order of increasing index. */ + if (page1->index > page2->index) + swap(page1, page2); + + lock_page(page1); + if (page1 != page2) + lock_page(page2); +} + +/* Unlock two pages, being careful not to unlock the same page twice. 
*/ +static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +{ + unlock_page(page1); + if (page1 != page2) + unlock_page(page2); +} + +/* * Compare extents of two files to see if they are the same. * Caller must have locked both inodes to prevent write races. */ @@ -1867,10 +1886,24 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, dest_page = vfs_dedupe_get_page(dest, destoff); if (IS_ERR(dest_page)) { error = PTR_ERR(dest_page); - unlock_page(src_page); put_page(src_page); goto out_error; } + + vfs_lock_two_pages(src_page, dest_page); + + /* + * Now that we've locked both pages, make sure they're still + * mapped to the file data we're interested in. If not, + * someone is invalidating pages on us and we lose. + */ + if (!PageUptodate(src_page) || !PageUptodate(dest_page) || + src_page->mapping != src->i_mapping || + dest_page->mapping != dest->i_mapping) { + same = false; + goto unlock; + } + src_addr = kmap_atomic(src_page); dest_addr = kmap_atomic(dest_page); @@ -1882,8 +1915,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, kunmap_atomic(dest_addr); kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); +unlock: + vfs_unlock_two_pages(src_page, dest_page); put_page(dest_page); put_page(src_page); diff --git a/fs/seq_file.c b/fs/seq_file.c index 04f09689cd6d..1600034a929b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -119,6 +119,7 @@ static int traverse(struct seq_file *m, loff_t offset) } if (seq_has_overflowed(m)) goto Eoverflow; + p = m->op->next(m, p, &m->index); if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; @@ -126,7 +127,6 @@ static int traverse(struct seq_file *m, loff_t offset) } pos += m->count; m->count = 0; - p = m->op->next(m, p, &m->index); if (pos == offset) break; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a6fc8aa1de7..48305ba41e3c 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -471,7 +471,11 @@ static int 
do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + + if (isalarm(ctx)) + hrtimer_cancel_wait_running(&ctx->t.alarm.timer); + else + hrtimer_cancel_wait_running(&ctx->t.tmr); } /* diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 80d7301ab76d..c0b84e960b20 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -51,7 +51,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); + writeback_inodes_sb_nr(c->vfs_sb, nr_to_write, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index b52624e28fa1..3b4b4114f208 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -129,7 +129,6 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { if (orph->del) { - spin_unlock(&c->orphan_lock); dbg_gen("deleted twice ino %lu", orph->inum); return; } @@ -138,7 +137,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) orph->del = 1; orph->dnext = c->orph_dnext; c->orph_dnext = orph; - spin_unlock(&c->orphan_lock); dbg_gen("delete later ino %lu", orph->inum); return; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 2c0803b0ac3a..8c1d571334bc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -609,6 +609,10 @@ static int init_constants_early(struct ubifs_info *c) c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; if (c->max_bu_buf_len > c->leb_size) c->max_bu_buf_len = c->leb_size; + + /* Log is ready, preserve one LEB for commits. */ + c->min_log_bytes = c->leb_size; + return 0; } diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index fcb41516ea59..6d30adb6b890 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -9,7 +9,7 @@ config UFS_FS this file system as well. 
Saying Y here will allow you to read from these partitions; if you also want to write to them, say Y to the experimental "UFS file system write support", below. Please read the - file <file:Documentation/filesystems/ufs.txt> for more information. + file <file:Documentation/admin-guide/ufs.rst> for more information. The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ccbdbd62f0d8..fe6d804a38dc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -880,6 +880,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + bool still_valid; WRITE_ONCE(ctx->released, true); @@ -895,8 +896,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); - if (!mmget_still_valid(mm)) - goto skip_mm; + still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -907,19 +907,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; + if (still_valid) { + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + } vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } -skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index baf0b72c0a37..07aad70f3931 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ 
-3835,15 +3835,28 @@ xfs_bmapi_read( XFS_STATS_INC(mp, xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); + if (!ifp) { + /* No CoW fork? Return a hole. */ + if (whichfork == XFS_COW_FORK) { + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = len; + mval->br_state = XFS_EXT_NORM; + *nmap = 1; + return 0; + } - /* No CoW fork? Return a hole. */ - if (whichfork == XFS_COW_FORK && !ifp) { - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = len; - mval->br_state = XFS_EXT_NORM; - *nmap = 1; - return 0; + /* + * A missing attr ifork implies that the inode says we're in + * extents or btree format but failed to pass the inode fork + * verifier while trying to load it. Treat that as a file + * corruption too. + */ +#ifdef DEBUG + xfs_alert(mp, "%s: inode %llu missing fork %d", + __func__, ip->i_ino, whichfork); +#endif /* DEBUG */ + return -EFSCORRUPTED; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index d1c77fd0815d..0bf56e94bfe9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -487,10 +487,8 @@ xfs_da3_split( ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; error = xfs_da3_root_split(state, oldblk, addblk); - if (error) { - addblk->bp = NULL; - return error; /* GROT: dir is inconsistent */ - } + if (error) + goto out; /* * Update pointers to the node which used to be block 0 and just got @@ -505,7 +503,10 @@ xfs_da3_split( */ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { + error = -EFSCORRUPTED; + goto out; + } node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); xfs_trans_log_buf(state->args->trans, addblk->bp, @@ -514,15 +515,19 @@ xfs_da3_split( } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - 
ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { + error = -EFSCORRUPTED; + goto out; + } node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); xfs_trans_log_buf(state->args->trans, addblk->bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } +out: addblk->bp = NULL; - return 0; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index afcc6642690a..1fc44efc344d 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -741,7 +741,8 @@ xfs_dir2_leafn_lookup_for_entry( ents = dp->d_ops->leaf_ents_p(leaf); xfs_dir3_leaf_check(dp, bp); - ASSERT(leafhdr.count > 0); + if (leafhdr.count <= 0) + return -EFSCORRUPTED; /* * Look up the hash value in the leaf entries. diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7fcf7569743f..7bd7534f5051 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -547,63 +547,12 @@ xfs_file_compat_ioctl( struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - void __user *arg = (void __user *)p; + void __user *arg = compat_ptr(p); int error; trace_xfs_file_compat_ioctl(ip); switch (cmd) { - /* No size or alignment issues on any arch */ - case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V4: - case XFS_IOC_FSGEOMETRY: - case XFS_IOC_AG_GEOMETRY: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - case XFS_IOC_FSSETDM: - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - case XFS_IOC_GETBMAPX: - case XFS_IOC_FSCOUNTS: - case XFS_IOC_SET_RESBLKS: - case XFS_IOC_GET_RESBLKS: - case XFS_IOC_FSGROWFSLOG: - case XFS_IOC_GOINGDOWN: - case XFS_IOC_ERROR_INJECTION: - case XFS_IOC_ERROR_CLEARALL: - case FS_IOC_GETFSMAP: - case XFS_IOC_SCRUB_METADATA: - case XFS_IOC_BULKSTAT: - case XFS_IOC_INUMBERS: - return xfs_file_ioctl(filp, cmd, p); -#if 
!defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) - /* - * These are handled fine if no alignment issues. To support x32 - * which uses native 64-bit alignment we must emit these cases in - * addition to the ia-32 compat set below. - */ - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - case XFS_IOC_FSGEOMETRY_V1: - case XFS_IOC_FSGROWFSDATA: - case XFS_IOC_FSGROWFSRT: - case XFS_IOC_ZERO_RANGE: -#ifdef CONFIG_X86_X32 - /* - * x32 special: this gets a different cmd number from the ia-32 compat - * case below; the associated data will match native 64-bit alignment. - */ - case XFS_IOC_SWAPEXT: -#endif - return xfs_file_ioctl(filp, cmd, p); -#endif #if defined(BROKEN_X86_ALIGNMENT) case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: @@ -705,6 +654,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSSETDM_BY_HANDLE_32: return xfs_compat_fssetdm_by_handle(filp, arg); default: - return -ENOIOCTLCMD; + /* try the native version */ + return xfs_file_ioctl(filp, cmd, (unsigned long)arg); } } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ff3c1fae5357..fe285d123d69 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -793,6 +793,7 @@ xfs_setattr_nonsize( out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 00e9f5c388d3..7fc3c1ad36bc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -429,10 +429,7 @@ xfs_log_reserve( ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, - KM_SLEEP | KM_MAYFAIL); - if (!tic) - return -ENOMEM; - + KM_SLEEP); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0c954cad7449..a339bd5fa260 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,7 +32,7 @@ xfs_break_leased_layouts( struct xfs_inode *ip = XFS_I(inode); int error; - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + while ((error = break_layout(inode, false)) == -EWOULDBLOCK) { xfs_iunlock(ip, *iolock); *did_unlock = true; error = break_layout(inode, true); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c4ec7afd1170..edbe37b7f636 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1190,11 +1190,11 @@ xfs_reflink_remap_blocks( } /* - * Grab the exclusive iolock for a data copy from src to dest, making - * sure to abide vfs locking order (lowest pointer value goes first) and - * breaking the pnfs layout leases on dest before proceeding. The loop - * is needed because we cannot call the blocking break_layout() with the - * src iolock held, and therefore have to back out both locks. + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. */ static int xfs_iolock_two_inodes_and_break_layout( @@ -1203,33 +1203,44 @@ xfs_iolock_two_inodes_and_break_layout( { int error; -retry: - if (src < dest) { - inode_lock_shared(src); - inode_lock_nested(dest, I_MUTEX_NONDIR2); - } else { - /* src >= dest */ - inode_lock(dest); - } + if (src > dest) + swap(src, dest); - error = break_layout(dest, false); - if (error == -EWOULDBLOCK) { - inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); +retry: + /* Wait to break both inodes' layouts before we start locking. 
*/ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { error = break_layout(dest, true); if (error) return error; - goto retry; } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); + if (error == -EWOULDBLOCK) + goto retry; return error; } - if (src > dest) - inode_lock_shared_nested(src, I_MUTEX_NONDIR2); + return 0; } @@ -1247,10 +1258,10 @@ xfs_reflink_remap_unlock( xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); inode_unlock(inode_out); if (!same_inode) - inode_unlock_shared(inode_in); + inode_unlock(inode_in); } /* @@ -1325,7 +1336,7 @@ xfs_reflink_remap_prep( if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, + xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ |