diff options
Diffstat (limited to 'fs')
50 files changed, 480 insertions, 272 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index adaf6f6dd858..e1cbdfdb7c68 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -310,9 +310,13 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (unlikely(copied < len && !PageUptodate(page))) { - copied = 0; - goto out; + if (!PageUptodate(page)) { + if (unlikely(copied < len)) { + copied = 0; + goto out; + } else if (len == PAGE_SIZE) { + SetPageUptodate(page); + } } /* * No need to use i_size_read() here, the i_size diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ce7181ea60fa..a7c5a9861bef 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -54,7 +54,7 @@ typedef struct { int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ - char *interpreter; /* filename of interpreter */ + const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; @@ -131,27 +131,26 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - char iname[BINPRM_BUF_SIZE]; - const char *iname_addr = iname; int retval; int fd_binary = -1; retval = -ENOEXEC; if (!enabled) - goto ret; + return retval; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); if (fmt) - strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + dget(fmt->dentry); read_unlock(&entries_lock); if (!fmt) - goto ret; + return retval; /* Need to be able to load the file after exec */ + retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) - return -ENOENT; + goto ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); @@ -195,22 +194,22 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; /* Update interp in case binfmt_script needs it. */ - retval = bprm_change_interp(iname, bprm); + retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto error; - if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) { + if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = filp_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { - interp_file = open_exec(iname); + interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) @@ -238,6 +237,7 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; ret: + dput(fmt->dentry); return retval; error: if (fd_binary > 0) @@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { + Node *e = inode->i_private; + + if (e && e->flags & MISC_FMT_OPEN_FILE) + filp_close(e->interp_file, NULL); + clear_inode(inode); - kfree(inode->i_private); + kfree(e); } static void kill_node(Node *e) @@ -603,24 +608,14 @@ static void kill_node(Node *e) struct dentry *dentry; write_lock(&entries_lock); - dentry = e->dentry; - if (dentry) { - list_del_init(&e->list); - e->dentry = NULL; - } + list_del_init(&e->list); write_unlock(&entries_lock); - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { - filp_close(e->interp_file, NULL); - e->interp_file = NULL; - } - - if (dentry) { - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); - } + dentry = e->dentry; + drop_nlink(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); } /* /<entry> */ @@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); - kill_node(e); + if (!list_empty(&e->list)) + kill_node(e); inode_unlock(d_inode(root)); break; @@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, inode_lock(d_inode(root)); while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); + kill_node(list_first_entry(&entries, Node, list)); inode_unlock(d_inode(root)); break; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index afdf4e3cafc2..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm) const char *i_arg, *i_name; char *cp; struct file *file; - char interp[BINPRM_BUF_SIZE]; int retval; if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) @@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm) break; } for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); - if (*cp == '\0') + if (*cp == '\0') return -ENOEXEC; /* No interpreter name found */ i_name = cp; i_arg = NULL; @@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm) *cp++ = '\0'; if (*cp) i_arg = cp; - strcpy (interp, i_name); /* * OK, we've parsed out the interpreter name and * (optional) argument. @@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm) if (retval) return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; if (i_arg) { retval = copy_strings_kernel(1, &i_arg, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; } retval = copy_strings_kernel(1, &i_name, bprm); - if (retval) return retval; + if (retval) + return retval; bprm->argc++; - retval = bprm_change_interp(interp, bprm); + retval = bprm_change_interp(i_name, bprm); if (retval < 0) return retval; /* * OK, now restart the process with the interpreter's dentry. */ - file = open_exec(interp); + file = open_exec(i_name); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c..789f55e851ae 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -716,10 +716,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); - if (result) + if (result) { end_page_writeback(page); - else + } else { + clean_page_buffers(page); unlock_page(page); + } blk_queue_exit(bdev->bd_queue); return result; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 899ddaeeacec..8fc690384c58 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -722,7 +722,7 @@ struct btrfs_delayed_root; * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) */ -#define BTRFS_FS_EXCL_OP 14 +#define BTRFS_FS_EXCL_OP 16 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 12ab19a4b93e..970190cd347e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector << 9); + bio = btrfs_bio_alloc(bdev, (u64)sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84edfc60d87a..f23c820daaed 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -734,12 +734,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = req->r_inode; ihold(inode); } else { - /* req->r_dentry is non-null for LSSNAP request. - * fall-thru */ - WARN_ON_ONCE(!req->r_dentry); + /* req->r_dentry is non-null for LSSNAP request */ + rcu_read_lock(); + inode = get_nonsnap_parent(req->r_dentry); + rcu_read_unlock(); + dout("__choose_mds using snapdir's parent %p\n", inode); } - } - if (!inode && req->r_dentry) { + } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 1ffc8b426c1c..7fc0b850c352 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -374,12 +374,10 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) { - ceph_put_snap_context(realm->cached_context); - /* queue realm for cap_snap creation */ - list_add_tail(&realm->dirty_item, dirty_realms); - } + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); return 0; fail: diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..b53e66d9abd7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -45,6 +45,12 @@ #define DIO_PAGES 64 /* + * Flags for dio_complete() + */ +#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ +#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ + +/* * This code generally works in units of "dio_blocks". A dio_block is * somewhere between the hard sector size and the filesystem block size. it * is determined on a per-invocation basis. When talking to the filesystem @@ -225,7 +231,7 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; @@ -259,14 +265,27 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + if (dio->end_io) { + // XXX: ki_pos?? + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; + } + /* * Try again to invalidate clean pages which might have been cached by * non-direct readahead, or faulted in by get_user_pages() if the source * of the write was an mmap'ed region of the file we're writing. Either * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. */ - if (ret > 0 && dio->op == REQ_OP_WRITE && + if (flags & DIO_COMPLETE_INVALIDATE && + ret > 0 && dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages) { err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, @@ -274,18 +293,10 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) WARN_ON_ONCE(err); } - if (dio->end_io) { - - // XXX: ki_pos?? - err = dio->end_io(dio->iocb, offset, ret, dio->private); - if (err) - ret = err; - } - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(dio->inode); - if (is_async) { + if (flags & DIO_COMPLETE_ASYNC) { /* * generic_write_sync expects ki_pos to have been updated * already, but the submission path only does this for @@ -306,7 +317,7 @@ static void dio_aio_complete_work(struct work_struct *work) { struct dio *dio = container_of(work, struct dio, complete_work); - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); } static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); @@ -348,7 +359,7 @@ static void dio_bio_end_aio(struct bio *bio) queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); } else { - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC); } } } @@ -866,7 +877,8 @@ out: */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); put_page(sdio->cur_page); sdio->cur_page = NULL; } @@ -1359,7 +1371,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, retval, false); + retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/exec.c b/fs/exec.c index ac34d9724684..5470d3c1892a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -int bprm_change_interp(char *interp, struct linux_binprm *bprm) +int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a7c90386947..4b4a72f392be 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2525,7 +2525,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 621b9b3d320b..c695ff462ee6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1210,11 +1210,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2244,7 +2244,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89f61eb3d167..933c3d529e65 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { diff --git a/fs/iomap.c b/fs/iomap.c index be61cf742b5e..d4801f8dd4fd 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -714,23 +714,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; ssize_t ret; - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (!dio->error && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - ret = invalidate_inode_pages2_range(inode->i_mapping, - iocb->ki_pos >> PAGE_SHIFT, - (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } - if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -742,12 +728,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (likely(!ret)) { ret = dio->size; /* check for short read */ - if (iocb->ki_pos + ret > dio->i_size && + if (offset + ret > dio->i_size && !(dio->flags & IOMAP_DIO_WRITE)) - ret = dio->i_size - iocb->ki_pos; + ret = dio->i_size - offset; iocb->ki_pos += ret; } + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + int err; + err = invalidate_inode_pages2_range(inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); diff --git a/fs/mpage.c b/fs/mpage.c index 37bb77c1302c..c991faec70b9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -468,6 +468,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) try_to_free_buffers(page); } +/* + * For situations where we want to clean all buffers attached to a page. + * We don't need to calculate how many buffers are attached to the page, + * we just need to specify a number larger than the maximum number of buffers. + */ +void clean_page_buffers(struct page *page) +{ + clean_buffers(page, ~0U); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -605,10 +615,8 @@ alloc_new: if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) { - clean_buffers(page, first_unmapped); + page, wbc)) goto out; - } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); diff --git a/fs/namespace.c b/fs/namespace.c index 54059b142d6b..3b601f115b6c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file) /* File refers to upper, writable layer? */ upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - if (upperdentry && file_inode(file) == d_inode(upperdentry)) + if (upperdentry && + (file_inode(file) == d_inode(upperdentry) || + file_inode(file) == d_inode(dentry))) return 0; /* Lower layer: can't write to real file, sorry... */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index efebe6cf4378..22880ef6d8dd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); - rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void) ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 44c638b7876c..508126eb49f9 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); + if (fl->dsaddr != NULL) + nfs4_fl_put_deviceid(fl->dsaddr); /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index dd5d27da8c0c..30426c1a1bbd 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); - if (ret <= 0) + if (ret < 0) return ERR_PTR(ret); rkey = request_key(&key_type_id_resolver, desc, ""); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6c61e2b99635..f90090e8c959 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || - nfs4_stateid_match_other(&lgp->args.stateid, - &lgp->args.ctx->state->stateid)) { + !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; exception->stateid = &lgp->args.stateid; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 37c8af003275..14ed9791ec9c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr, * Assumes OPEN is the biggest non-idempotent compound. * 2 is the verifier. */ - max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + - RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2) + * XDR_UNIT + RPC_MAX_AUTH_SIZE; encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3c69db7d4905..8487486ec496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -927,6 +927,13 @@ nfsd4_secinfo_release(union nfsd4_op_u *u) exp_put(u->secinfo.si_exp); } +static void +nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) +{ + if (u->secinfo_no_name.sin_exp) + exp_put(u->secinfo_no_name.sin_exp); +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -2375,7 +2382,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO_NO_NAME] = { .op_func = nfsd4_secinfo_no_name, - .op_release = nfsd4_secinfo_release, + .op_release = nfsd4_secinfo_no_name_release, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", .op_rsize_bop = nfsd4_secinfo_rsize, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aad97b30d5e6..c441f9387a1b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) c->tmpfile = true; err = ovl_copy_up_locked(c); } else { - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) { - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - } else { + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (!err) { err = ovl_copy_up_locked(c); unlock_rename(c->workdir, c->destdir); } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 3309b1912241..cc961a3bd3bd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -216,26 +216,6 @@ out_unlock: return err; } -static int ovl_lock_rename_workdir(struct dentry *workdir, - struct dentry *upperdir) -{ - /* Workdir should not be the same as upperdir */ - if (workdir == upperdir) - goto err; - - /* Workdir should not be subdir of upperdir and vice versa */ - if (lock_rename(workdir, upperdir) != NULL) - goto err_unlock; - - return 0; - -err_unlock: - unlock_rename(workdir, upperdir); -err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - return -EIO; -} - static struct dentry *ovl_clear_empty(struct dentry *dentry, struct list_head *list) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c3addd1114f1..654bea1a5ac9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -506,6 +506,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { + err = PTR_ERR(index); pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d4e8c1a08fb0..c706a6f99928 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry, bool *locked); void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 878a750986dd..25d9b5adcd42 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -37,6 +37,9 @@ struct ovl_fs { bool noxattr; /* sb common to all layers */ struct super_block *same_sb; + /* Did we take the inuse lock? */ + bool upperdir_locked; + bool workdir_locked; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 62e9b22a2077..0f85ee9c3268 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct path *lowerstack, unsigned int numlower) { int err; + struct dentry *index = NULL; struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); @@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { - struct dentry *index; - if (p->name[0] == '.') { if (p->len == 1) continue; @@ -1018,6 +1017,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = lookup_one_len(p->name, dentry, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); + index = NULL; break; } err = ovl_verify_index(index, lowerstack, numlower); @@ -1029,7 +1029,9 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, break; } dput(index); + index = NULL; } + dput(index); inode_unlock(dir); out: ovl_cache_free(&list); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fd5ea4facc62..092d150643c1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,9 +211,10 @@ static void ovl_put_super(struct super_block *sb) dput(ufs->indexdir); dput(ufs->workdir); - ovl_inuse_unlock(ufs->workbasedir); + if (ufs->workdir_locked) + ovl_inuse_unlock(ufs->workbasedir); dput(ufs->workbasedir); - if (ufs->upper_mnt) + if (ufs->upper_mnt && ufs->upperdir_locked) ovl_inuse_unlock(ufs->upper_mnt->mnt_root); mntput(ufs->upper_mnt); for (i = 0; i < ufs->numlower; i++) @@ -881,9 +882,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_upperpath; err = -EBUSY; - if (!ovl_inuse_trylock(upperpath.dentry)) { - pr_err("overlayfs: upperdir is in-use by another mount\n"); + if (ovl_inuse_trylock(upperpath.dentry)) { + ufs->upperdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); goto out_put_upperpath; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } err = ovl_mount_dir(ufs->config.workdir, &workpath); @@ -901,9 +906,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } err = -EBUSY; - if (!ovl_inuse_trylock(workpath.dentry)) { - pr_err("overlayfs: workdir is in-use by another mount\n"); + if (ovl_inuse_trylock(workpath.dentry)) { + ufs->workdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); goto out_put_workpath; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } ufs->workbasedir = workpath.dentry; @@ -1156,11 +1165,13 @@ out_put_lowerpath: out_free_lowertmp: kfree(lowertmp); out_unlock_workdentry: - ovl_inuse_unlock(workpath.dentry); + if (ufs->workdir_locked) + ovl_inuse_unlock(workpath.dentry); out_put_workpath: path_put(&workpath); out_unlock_upperdentry: - ovl_inuse_unlock(upperpath.dentry); + if (ufs->upperdir_locked) + ovl_inuse_unlock(upperpath.dentry); out_put_upperpath: path_put(&upperpath); out_free_config: diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 117794582f9f..b9b239fa5cfd 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry) } } -/* Called must hold OVL_I(inode)->oi_lock */ +/* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode; @@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry) err = PTR_ERR(index); if (!IS_ERR(index)) err = ovl_cleanup(dir, index); + else + index = NULL; + inode_unlock(dir); if (err) goto fail; @@ -557,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } } + +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 50b0556a124f..52ad15192e72 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1297,21 +1297,18 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - goto add; + goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; - if (flags & DQUOT_SPACE_NOFAIL) - goto add; - if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1322,7 +1319,7 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1338,13 +1335,21 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, * be always printed */ ret = -EDQUOT; - goto out; + goto finish; } } -add: - dquot->dq_dqb.dqb_rsvspace += rsv_space; - dquot->dq_dqb.dqb_curspace += space; -out: +finish: + /* + * We have to be careful and go through warning generation & grace time + * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it + * only here... + */ + if (flags & DQUOT_SPACE_NOFAIL) + ret = 0; + if (!ret) { + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; + } spin_unlock(&dquot->dq_dqb_lock); return ret; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ef4b48d1ea42..1c713fd5b3e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -588,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. + */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; @@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. + */ + userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); + + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); - if (!ret) { - spin_lock(&ctx->event_wqh.lock); - if (!list_empty(&fork_event)) { - uwq = list_first_entry(&fork_event, - typeof(*uwq), - wq.entry); - list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->event_wqh, &uwq->wq); + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); - } - spin_unlock(&ctx->event_wqh.lock); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. + */ + if (ret) + userfaultfd_ctx_put(fork_nctx); } + spin_unlock(&ctx->event_wqh.lock); } return ret; diff --git a/fs/xattr.c b/fs/xattr.c index 4424f7fecf14..61cd28ba25f3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -250,7 +250,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 744dcaec34cc..f965ce832bc0 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1584,6 +1584,10 @@ xfs_alloc_ag_vextent_small( bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto error0; + } xfs_trans_binval(args->tp, bp); } args->len = 1; @@ -2141,6 +2145,10 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto out_agbp_relse; + } xfs_trans_binval(tp, bp); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 044a363119be..89263797cf32 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1477,14 +1477,14 @@ xfs_bmap_isaeof( int is_empty; int error; - bma->aeof = 0; + bma->aeof = false; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); if (error) return error; if (is_empty) { - bma->aeof = 1; + bma->aeof = true; return 0; } @@ -3852,6 +3852,17 @@ xfs_trim_extent( } } +/* trim extent to within eof */ +void +xfs_trim_extent_eof( + struct xfs_bmbt_irec *irec, + struct xfs_inode *ip) + +{ + xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, + i_size_read(VFS_I(ip)))); +} + /* * Trim the returned map to the required bounds */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 851982a5dfbc..502e0d8fb4ff 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -208,6 +208,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); +void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988bb3f31446..dfd643909f85 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1962,7 +1962,7 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - xic->deleted = 1; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -1989,7 +1989,7 @@ xfs_difree_inobt( xfs_difree_inode_chunk(mp, agno, &rec, dfops); } else { - xic->deleted = 0; + xic->deleted = false; error = xfs_inobt_update(cur, &rec); if (error) { diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8372e9bcd7b6..71de185735e0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format { uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ + uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ @@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format { int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; -typedef struct xfs_inode_log_format_32 { - uint16_t ilf_type; /* inode log item type */ - uint16_t ilf_size; /* size of this item */ - uint32_t ilf_fields; /* flags for fields logged */ - uint16_t ilf_asize; /* size of attr d/ext/root */ - uint16_t ilf_dsize; /* size of data/ext/root */ - uint64_t ilf_ino; /* inode number */ - union { - uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - int64_t ilf_blkno; /* blkno of inode buffer */ - int32_t ilf_len; /* len of inode buffer */ - int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { +/* + * Old 32 bit systems will log in this format without the 64 bit + * alignment padding. Recovery will detect this and convert it to the + * correct format. + */ +struct xfs_inode_log_format_32 { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ - uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ @@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 { int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; +} __attribute__((packed)); /* diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 7034e17535de..3354140de07e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode) int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + umode_t mode; + bool set_mode = false; int error = 0; if (!acl) @@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode; - error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - error = xfs_set_mode(inode, mode); - if (error) - return error; + set_mode = true; } set_acl: - return __xfs_set_acl(inode, acl, type); + error = __xfs_set_acl(inode, acl, type); + if (error) + return error; + + /* + * We set the mode after successfully updating the ACL xattr because the + * xattr update can fail at ENOSPC and we don't want to change the mode + * if the ACL update hasn't been applied. + */ + if (set_mode) + error = xfs_set_mode(inode, mode); + + return error; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f18e5932aec4..a3eeaba156c5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -446,6 +446,19 @@ xfs_imap_valid( { offset >>= inode->i_blkbits; + /* + * We have to make sure the cached mapping is within EOF to protect + * against eofblocks trimming on file release leaving us with a stale + * mapping. Otherwise, a page for a subsequent file extending buffered + * write could get picked up by this writeback cycle and written to the + * wrong blocks. + * + * Note that what we really want here is a generic mapping invalidation + * mechanism to protect us from arbitrary extent modifying contexts, not + * just eofblocks. + */ + xfs_trim_extent_eof(imap, XFS_I(inode)); + return offset >= imap->br_startoff && offset < imap->br_startoff + imap->br_blockcount; } @@ -735,6 +748,14 @@ xfs_vm_invalidatepage( { trace_xfs_invalidatepage(page->mapping->host, page, offset, length); + + /* + * If we are invalidating the entire page, clear the dirty state from it + * so that we can check for attempts to release dirty cached pages in + * xfs_vm_releasepage(). + */ + if (offset == 0 && length >= PAGE_SIZE) + cancel_dirty_page(page); block_invalidatepage(page, offset, length); } @@ -1190,25 +1211,27 @@ xfs_vm_releasepage( * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to * ->releasepage() via shrink_active_list(). Conversely, - * block_invalidatepage() can send pages that are still marked dirty - * but otherwise have invalidated buffers. + * block_invalidatepage() can send pages that are still marked dirty but + * otherwise have invalidated buffers. * * We want to release the latter to avoid unnecessary buildup of the - * LRU, skip the former and warn if we've left any lingering - * delalloc/unwritten buffers on clean pages. Skip pages with delalloc - * or unwritten buffers and warn if the page is not dirty. Otherwise - * try to release the buffers. + * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages + * that are entirely invalidated and need to be released. Hence the + * only time we should get dirty pages here is through + * shrink_active_list() and so we can simply skip those now. + * + * warn if we've left any lingering delalloc/unwritten buffers on clean + * or invalidated pages we are about to release. */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); - if (delalloc) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(delalloc)) return 0; - } - if (unwritten) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(unwritten)) return 0; - } return try_to_free_buffers(page); } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index ebd66b19fbfc..e3a950ed35a8 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -302,6 +302,8 @@ xfs_attr3_node_inactive( &bp, XFS_ATTR_FORK); if (error) return error; + node = bp->b_addr; + btree = dp->d_ops->node_tree_p(node); child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index bc6c6e10a969..6503cfa44262 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -84,6 +84,7 @@ xfs_zero_extent( GFP_NOFS, 0); } +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -190,6 +191,7 @@ xfs_bmap_rtalloc( } return 0; } +#endif /* CONFIG_XFS_RT */ /* * Check if the endoff is outside the last extent. If so the caller will grow @@ -2122,11 +2124,31 @@ xfs_swap_extents( ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + } + + /* Swap the cow forks. */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + xfs_extnum_t extnum; + + ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + + extnum = ip->i_cnextents; + ip->i_cnextents = tip->i_cnextents; + tip->i_cnextents = extnum; + cowfp = ip->i_cowfp; ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - xfs_inode_set_cowblocks_tag(ip); - xfs_inode_set_cowblocks_tag(tip); + + if (ip->i_cowfp && ip->i_cnextents) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); + if (tip->i_cowfp && tip->i_cnextents) + xfs_inode_set_cowblocks_tag(tip); + else + xfs_inode_clear_cowblocks_tag(tip); } xfs_trans_log_inode(tp, ip, src_log_flags); diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0eaa81dc49be..7d330b3c77c3 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -28,7 +28,20 @@ struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +#else /* !CONFIG_XFS_RT */ +/* + * Attempts to allocate RT extents when RT is disable indicates corruption and + * should trigger a shutdown. + */ +static inline int +xfs_bmap_rtalloc(struct xfs_bmalloca *ap) +{ + return -EFSCORRUPTED; +} +#endif /* CONFIG_XFS_RT */ + int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..56d0e526870c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -764,7 +764,7 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; - bool do_file_insert = 0; + bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -825,7 +825,7 @@ xfs_file_fallocate( error = -EINVAL; goto out_unlock; } - do_file_insert = 1; + do_file_insert = true; } else { flags |= XFS_PREALLOC_SET; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 814ed729881d..43cfc07996a4 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper( return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); } -/* Transform a rtbitmap "record" into a fsmap */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_helper( - struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, - void *priv) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; - - rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); - - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; - - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); -} - /* Transform a bnobt irec into a fsmap */ STATIC int xfs_getfsmap_datadev_bnobt_helper( @@ -475,6 +452,30 @@ xfs_getfsmap_logdev( return xfs_getfsmap_helper(tp, info, &rmap, 0); } +#ifdef CONFIG_XFS_RT +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); +} + /* Execute a getfsmap query against the realtime device. */ STATIC int __xfs_getfsmap_rtdev( @@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap( return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, info); } +#endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ STATIC int @@ -795,7 +797,15 @@ xfs_getfsmap_check_keys( return false; } +/* + * There are only two devices if we didn't configure RT devices at build time. + */ +#ifdef CONFIG_XFS_RT #define XFS_GETFSMAP_DEVS 3 +#else +#define XFS_GETFSMAP_DEVS 2 +#endif /* CONFIG_XFS_RT */ + /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all @@ -853,10 +863,12 @@ xfs_getfsmap( handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } +#ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } +#endif /* CONFIG_XFS_RT */ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), xfs_getfsmap_dev_compare); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a705f34b58fa..9bbc2d7cc8cb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -364,6 +364,9 @@ xfs_inode_to_log_dinode( to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; + /* log a dummy value to ensure log structure is fully initialised */ + to->di_next_unlinked = NULLAGINO; + if (from->di_version == 3) { to->di_changecount = inode->i_version; to->di_crtime.t_sec = from->di_crtime.t_sec; @@ -404,6 +407,11 @@ xfs_inode_item_format_core( * the second with the on-disk inode structure, and a possible third and/or * fourth with the inode data/extents/b-tree root and inode attributes * data/extents/b-tree root. + * + * Note: Always use the 64 bit inode log format structure so we don't + * leave an uninitialised hole in the format item on 64 bit systems. Log + * recovery on 32 bit systems handles this just fine, so there's no reason + * for not using an initialising the properly padded structure all the time. */ STATIC void xfs_inode_item_format( @@ -412,8 +420,8 @@ xfs_inode_item_format( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; + struct xfs_inode_log_format *ilf; ASSERT(ip->i_d.di_version > 1); @@ -425,7 +433,17 @@ xfs_inode_item_format( ilf->ilf_boffset = ip->i_imap.im_boffset; ilf->ilf_fields = XFS_ILOG_CORE; ilf->ilf_size = 2; /* format + core */ - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + /* + * make sure we don't leak uninitialised data into the log in the case + * when we don't log every field in the inode. + */ + ilf->ilf_dsize = 0; + ilf->ilf_asize = 0; + ilf->ilf_pad = 0; + uuid_copy(&ilf->ilf_u.ilfu_uuid, &uuid_null); + + xlog_finish_iovec(lv, vecp, sizeof(*ilf)); xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); @@ -855,44 +873,29 @@ xfs_istale_done( } /* - * convert an xfs_inode_log_format struct from either 32 or 64 bit versions - * (which can have different field alignments) to the native version + * convert an xfs_inode_log_format struct from the old 32 bit version + * (which can have different field alignments) to the native 64 bit version */ int xfs_inode_item_format_convert( - xfs_log_iovec_t *buf, - xfs_inode_log_format_t *in_f) + struct xfs_log_iovec *buf, + struct xfs_inode_log_format *in_f) { - if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - - in_f->ilf_type = in_f32->ilf_type; - in_f->ilf_size = in_f32->ilf_size; - in_f->ilf_fields = in_f32->ilf_fields; - in_f->ilf_asize = in_f32->ilf_asize; - in_f->ilf_dsize = in_f32->ilf_dsize; - in_f->ilf_ino = in_f32->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f32->ilf_blkno; - in_f->ilf_len = in_f32->ilf_len; - in_f->ilf_boffset = in_f32->ilf_boffset; - return 0; - } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - - in_f->ilf_type = in_f64->ilf_type; - in_f->ilf_size = in_f64->ilf_size; - in_f->ilf_fields = in_f64->ilf_fields; - in_f->ilf_asize = in_f64->ilf_asize; - in_f->ilf_dsize = in_f64->ilf_dsize; - in_f->ilf_ino = in_f64->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f64->ilf_blkno; - in_f->ilf_len = in_f64->ilf_len; - in_f->ilf_boffset = in_f64->ilf_boffset; - return 0; - } - return -EFSCORRUPTED; + struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + + if (buf->i_len != sizeof(*in_f32)) + return -EFSCORRUPTED; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c5107c7bc4bf..dc95a49d62e7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2515,7 +2515,7 @@ next_lv: if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0 && ordered == false) { + if (record_cnt == 0 && !ordered) { if (!lv) return 0; break; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ea7d4b4e50d0..e9727d0a541a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -704,7 +704,7 @@ xfs_mountfs( xfs_set_maxicount(mp); /* enable fail_at_unmount as default */ - mp->m_fail_unmount = 1; + mp->m_fail_unmount = true; error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0c381d71b242..0492436a053f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); - XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3246815c24d6..37e603bf1591 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -736,7 +736,13 @@ xfs_reflink_end_cow( /* If there is a hole at end_fsb - 1 go to the previous extent */ if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || got.br_startoff > end_fsb) { - ASSERT(idx > 0); + /* + * In case of racing, overlapping AIO writes no COW extents + * might be left by the time I/O completes for the loser of + * the race. In that case we are done. + */ + if (idx <= 0) + goto out_cancel; xfs_iext_get_extent(ifp, --idx, &got); } @@ -809,6 +815,7 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); +out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: |