diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/addr.c | 9 | ||||
-rw-r--r-- | fs/ceph/file.c | 2 | ||||
-rw-r--r-- | fs/cifs/smb2ops.c | 2 | ||||
-rw-r--r-- | fs/ext2/ext2.h | 11 | ||||
-rw-r--r-- | fs/ext2/file.c | 7 | ||||
-rw-r--r-- | fs/ext2/inode.c | 12 | ||||
-rw-r--r-- | fs/ext2/super.c | 3 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 10 | ||||
-rw-r--r-- | fs/ext4/extents.c | 25 | ||||
-rw-r--r-- | fs/ext4/file.c | 13 | ||||
-rw-r--r-- | fs/ext4/inode.c | 47 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 4 | ||||
-rw-r--r-- | fs/ext4/super.c | 13 | ||||
-rw-r--r-- | fs/ext4/truncate.h | 8 | ||||
-rw-r--r-- | fs/f2fs/data.c | 8 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 1 | ||||
-rw-r--r-- | fs/f2fs/file.c | 62 | ||||
-rw-r--r-- | fs/f2fs/super.c | 1 | ||||
-rw-r--r-- | fs/fuse/dax.c | 50 | ||||
-rw-r--r-- | fs/fuse/dir.c | 11 | ||||
-rw-r--r-- | fs/fuse/file.c | 10 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 7 | ||||
-rw-r--r-- | fs/fuse/inode.c | 1 | ||||
-rw-r--r-- | fs/inode.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap_util.c | 15 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 13 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 121 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 2 | ||||
-rw-r--r-- | fs/zonefs/super.c | 23 | ||||
-rw-r--r-- | fs/zonefs/zonefs.h | 7 |
31 files changed, 228 insertions, 275 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a1e2813731d1..7e7a897ae0d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; - struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); + struct page *page; + + filemap_invalidate_lock_shared(mapping); + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; @@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: + filemap_invalidate_unlock_shared(mapping); dout("filemap_fault %p %llu read inline data ret %x\n", inode, off, ret); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d1755ac1d964..e1d605a02d4a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; + filemap_invalidate_lock(inode->i_mapping); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode, if (dirty) __mark_inode_dirty(inode, dirty); } + filemap_invalidate_unlock(inode->i_mapping); ceph_put_cap_refs(ci, got); unlock: diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2dfd0d8297eb..ddc0e8f97872 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } + filemap_invalidate_lock(inode->i_mapping); /* * We implement the punch hole through ioctl, so we need remove the page * caches first, otherwise the data may be inconsistent with the server. @@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); free_xid(xid); + filemap_invalidate_unlock(inode->i_mapping); return rc; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e512630cb63e..3be9dd6412b7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -667,9 +667,6 @@ struct ext2_inode_info { struct rw_semaphore xattr_sem; #endif rwlock_t i_meta_lock; -#ifdef CONFIG_FS_DAX - struct rw_semaphore dax_sem; -#endif /* * truncate_mutex is for serialising ext2_truncate() against @@ -685,14 +682,6 @@ struct ext2_inode_info { #endif }; -#ifdef CONFIG_FS_DAX -#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem) -#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem) -#else -#define dax_sem_down_write(ext2_inode) -#define dax_sem_up_write(ext2_inode) -#endif - /* * Inode dynamic state flags */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index f98466acc672..eb97aa3d700e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,7 +81,7 @@ out_unlock: * * mmap_lock (MM) * sb_start_pagefault (vfs, freeze) - * ext2_inode_info->dax_sem + * address_space->invalidate_lock * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * @@ -91,7 +91,6 @@ out_unlock: static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); @@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } - down_read(&ei->dax_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); - up_read(&ei->dax_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 04f0def0f5eb..333fa62661d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1178,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -/* dax_sem must be held when calling this function */ +/* mapping->invalidate_lock must be held when calling this function */ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; @@ -1195,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); #ifdef CONFIG_FS_DAX - WARN_ON(!rwsem_is_locked(&ei->dax_sem)); + WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)); #endif n = ext2_block_to_path(inode, iblock, offsets, NULL); @@ -1277,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset) if (ext2_inode_is_fast_symlink(inode)) return; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); __ext2_truncate_blocks(inode, offset); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); } static int ext2_setsize(struct inode *inode, loff_t newsize) @@ -1309,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) if (error) return error; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, newsize); __ext2_truncate_blocks(inode, newsize); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); inode->i_mtime = inode->i_ctime = current_time(inode); if (inode_needs_sync(inode)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 21e09fbaa46f..987bcf32ed46 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,9 +206,6 @@ static void init_once(void *foo) init_rwsem(&ei->xattr_sem); #endif mutex_init(&ei->truncate_mutex); -#ifdef CONFIG_FS_DAX - init_rwsem(&ei->dax_sem); -#endif inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..7ebaf66b6e31 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1086,15 +1086,6 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; - /* - * i_mmap_sem is for serializing page faults with truncate / punch hole - * operations. We have to make sure that new page cannot be faulted in - * a section of the inode that is being punched. We cannot easily use - * i_data_sem for this since we need protection for the whole punch - * operation and i_data_sem ranks below transaction start so we have - * to occasionally drop it. - */ - struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92ad64b89d9b..c33e0a2cb6c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; @@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Prevent page faults from reinstantiating pages we have * released from page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ @@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } @@ -5221,6 +5222,7 @@ out: static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; @@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Write tail of the last page before removed range since it will get * removed from the page cache below. */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty - * by i_mmap_sem. + * by i_rwsem and invalidate_lock. */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; @@ -5350,7 +5352,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5367,6 +5369,7 @@ out_mutex: static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; @@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5526,7 +5529,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 816dedcbd541..d3b4ed91aa68 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); if (write) { @@ -731,10 +732,10 @@ retry: /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); } return result; @@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = { #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = ext4_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..325c038e7b23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei) +static void ext4_wait_dax_page(struct inode *inode) { - up_write(&ei->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&ei->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { - struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) + if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { @@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei)); + ext4_wait_dax_page(inode)); } while (error == 0); return error; @@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); if (ext4_has_inline_data(inode)) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_convert_inline_data(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; } @@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) out_stop: ext4_journal_stop(handle); out_dio: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_dio_wait(inode); } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); goto err_out; } @@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = rc; } out_mmap_sem: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); } if (!error) { @@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * data (and journalled aops don't know how to handle these cases). */ if (val) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return err; } } @@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) percpu_up_write(&sbi->s_writepages_rwsem); if (val) - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ @@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) @@ -6176,7 +6175,7 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(err); out: - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: @@ -6184,15 +6183,3 @@ out_error: ext4_journal_stop(handle); goto out; } - -vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - down_read(&EXT4_I(inode)->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - - return ret; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..4fb5fe083c2b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb, goto journal_err_out; } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; @@ -256,7 +256,7 @@ err_out1: ext4_double_up_write_data_sem(inode, inode_bl); err_out: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..d6df62fc810c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, /* * Lock ordering * - * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and - * i_mmap_rwsem (inode->i_mmap_rwsem)! - * * page fault path: - * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> - * page lock -> i_data_sem (rw) + * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start + * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock @@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> + * page lock + * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: @@ -1360,7 +1358,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); - init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index bcbe3668c1d4..ce84aa2786c7 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -11,14 +11,16 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + struct address_space *mapping = inode->i_mapping; + /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ - down_write(&EXT4_I(inode)->i_mmap_sem); - truncate_inode_pages(inode->i_mapping, inode->i_size); + filemap_invalidate_lock(mapping); + truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2cf48c5a2e4..eb222b35edef 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, int ret = 0; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); @@ -3894,7 +3894,7 @@ done: clear_inode_flag(inode, FI_DO_DEFRAG); clear_inode_flag(inode, FI_ALIGNED_WRITE); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee8eb33e2c25..906b2c4b50e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -754,7 +754,6 @@ struct f2fs_inode_info { /* avoid racing between foreground op and gc */ struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6afd4562335f..1ff333755721 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -38,10 +38,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); - if (!ret) f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, F2FS_BLKSIZE); @@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); err: @@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end = (loff_t)pg_end << PAGE_SHIFT; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; ret = f2fs_truncate_blocks(inode, new_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (!ret) f2fs_i_size_write(inode, new_size); return ret; @@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1543,6 +1540,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; /* write out all dirty pages from offset */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); - filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (!ret) f2fs_i_size_write(inode, new_size); @@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto err; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, to_end ? LLONG_MAX : end_addr - 1); @@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); out: - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); @@ -4313,9 +4311,9 @@ write: /* if we couldn't write data, we should deallocate blocks. */ if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fecd3050ccd..ce2ab1b85c11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->i_gc_rwsem[READ]); init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9d58371d22c2..281d79f8b3d3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, /* * Can't do inline reclaim in fault path. We call * dax_layout_busy_page() before we free a range. And - * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. - * In fault path we enter with fi->i_mmap_sem held and can't drop - * it. Also in fault path we hold fi->i_mmap_sem shared and not - * exclusive, so that creates further issues with fuse_wait_dax_page(). - * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory - * range to become free and retry. + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. */ if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fcd); @@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, idx, idx); - /* We are holding either inode lock or i_mmap_sem, and that should + /* We are holding either inode lock or invalidate_lock, and that should * ensure that dmap can't be truncated. We are holding a reference * on dmap and that should make sure it can't be reclaimed. So dmap * should still be there in tree despite the fact we dropped and @@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = { static void fuse_wait_dax_page(struct inode *inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with fi->i_mmap_sem lock held exclusively */ +/* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { @@ -813,18 +811,18 @@ retry: * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. */ - down_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); @@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, int ret; struct interval_tree_node *node; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); @@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return dmap; } @@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise - * if a deadlock is possible if we sleep with fi->i_mmap_sem - * held and worker to free memory can't make progress due - * to unavailability of fi->i_mmap_sem lock. So sleep - * only if fi->dax->nr=0 + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; @@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding - * fi->i_mmap_sem, worker should still be able to free up - * a range and wake us up. + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, @@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, /* * Free a range of memory. * Locking: - * 1. Take fi->i_mmap_sem to block dax faults. + * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ @@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", @@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index eade6f965b2e..d9b977c0f38d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (FUSE_IS_DAX(inode) && is_truncate) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } } @@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return 0; @@ -1711,7 +1712,7 @@ error: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97f860cfc195..621a662c19fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } if (dax_truncate) { - down_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) out: if (dax_truncate) - up_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); @@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (lock_inode) { inode_lock(inode); if (block_faults) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -2976,7 +2976,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (lock_inode) inode_unlock(inode); @@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * - * To fix this a i_mmap_sem style lock could be used to prevent new + * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07829ce78695..6fb639b97ea8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -149,13 +149,6 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; - /** - * Can't take inode lock in fault path (leads to circular dependency). - * Introduce another semaphore which can be taken in fault path and - * then other filesystem paths can take this to block faults. - */ - struct rw_semaphore i_mmap_sem; - #ifdef CONFIG_FUSE_DAX /* * Dax specific inode data diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b9beb39a4a18..e07e429f32e1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); - init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..84c528cd1955 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; + __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", + &sb->s_type->invalidate_lock_key); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 213a97a921bb..1cd3f940fa6a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1626,7 +1626,6 @@ xfs_swap_extents( struct xfs_bstat *sbp = &sxp->sx_stat; int src_log_flags, target_log_flags; int error = 0; - int lock_flags; uint64_t f; int resblks = 0; unsigned int flags = 0; @@ -1638,8 +1637,8 @@ xfs_swap_extents( * do the rest of the checks. */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); - lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1711,7 +1710,6 @@ xfs_swap_extents( * or cancel will unlock the inodes from this point onwards. */ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); @@ -1830,13 +1828,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock_ilock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); + filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out_unlock; + goto out_unlock_ilock; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc3cfb12df53..3dfbdcdb0d1c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1302,7 +1302,7 @@ xfs_file_llseek( * * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) - * i_mmaplock (XFS - truncate serialisation) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1323,24 +1323,27 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_direct_write_iomap_ops : &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { - if (write_fault) + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); - else + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else { ret = filemap_fault(vmf); + } } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 990b72ae3635..f00145e1a976 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared( /* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, @@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared( * * Basic locking order: * - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock * * mmap_lock locking order: * * i_rwsem -> page lock -> mmap_lock - * mmap_lock -> i_mmap_lock -> page_lock + * mmap_lock -> invalidate_lock -> page_lock * * The difference in mmap_lock locking order mean that we cannot hold the - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock - * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_lock. + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths + * can fault in pages during copy in/out (for buffered IO) or require the + * mmap_lock in get_user_pages() to map the user pages into the kernel address + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page + * fault because page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both - * taken in places where we need to invalidate the page cache in a race + * take both the i_rwsem and the invalidate_lock. These locks should *only* be + * both taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). */ @@ -188,10 +188,13 @@ xfs_ilock( XFS_IOLOCK_DEP(lock_flags)); } - if (lock_flags & XFS_MMAPLOCK_EXCL) - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_MMAPLOCK_SHARED) - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_MMAPLOCK_EXCL) { - if (!mrtryupdate(&ip->i_mmaplock)) + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } else if (lock_flags & XFS_MMAPLOCK_SHARED) { - if (!mrtryaccess(&ip->i_mmaplock)) + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } @@ -258,9 +261,9 @@ xfs_ilock_nowait( out_undo_mmaplock: if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -307,9 +310,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); @@ -335,7 +338,7 @@ xfs_ilock_demote( if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrdemote(&ip->i_mmaplock); + downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_rwsem); @@ -343,9 +346,29 @@ xfs_ilock_demote( } #if defined(DEBUG) || defined(XFS_WARN) -int +static inline bool +__xfs_rwsem_islocked( + struct rw_semaphore *rwsem, + bool shared) +{ + if (!debug_locks) + return rwsem_is_locked(rwsem); + + if (!shared) + return lockdep_is_held_type(rwsem, 0); + + /* + * We are checking that the lock is held at least in shared + * mode but don't care that it might be held exclusively + * (i.e. shared | excl). Hence we check if the lock is held + * in any mode rather than an explicit shared mode. + */ + return lockdep_is_held_type(rwsem, -1); +} + +bool xfs_isilocked( - xfs_inode_t *ip, + struct xfs_inode *ip, uint lock_flags) { if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { @@ -355,20 +378,17 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - if (!(lock_flags & XFS_MMAPLOCK_SHARED)) - return !!ip->i_mmaplock.mr_writer; - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !debug_locks || - lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); - return rwsem_is_locked(&VFS_I(ip)->i_rwsem); + if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } ASSERT(0); - return 0; + return false; } #endif @@ -532,12 +552,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the mmaplock or the ilock, but not more than one type at a time. If we lock - * more than one at a time, lockdep will report false positives saying we have - * violated locking orders. The iolock must be double-locked separately since - * we use i_rwsem for that. We now support taking one lock EXCL and the other - * SHARED. + * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and + * mmaplock must be double-locked separately since we use i_rwsem and + * invalidate_lock for that. We now support taking one lock EXCL and the + * other SHARED. */ void xfs_lock_two_inodes( @@ -555,15 +573,8 @@ xfs_lock_two_inodes( ASSERT(hweight32(ip1_mode) == 1); ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -3741,11 +3752,8 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - if (ip1 == ip2) - xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, - ip2, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); return 0; } @@ -3755,12 +3763,9 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - bool same_inode = (ip1 == ip2); - - xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); inode_unlock(VFS_I(ip2)); - if (!same_inode) + if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 4b6703dbffb8..e0ae905554e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -40,7 +40,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ /* @@ -410,7 +409,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -int xfs_isilocked(xfs_inode_t *, uint); +bool xfs_isilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2c9e26a44546..102cbd606633 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -709,8 +709,6 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70055d486bf7..ddc346a9df9b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) inode_dio_wait(inode); /* Serialize against page faults */ - down_write(&zi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Serialize against zonefs_iomap_begin() */ mutex_lock(&zi->i_truncate_mutex); @@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) unlock: mutex_unlock(&zi->i_truncate_mutex); - up_write(&zi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } @@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, return ret; } -static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf) -{ - struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file)); - vm_fault_t ret; - - down_read(&zi->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&zi->i_mmap_sem); - - return ret; -} - static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* Serialize against truncates */ - down_read(&zi->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); - up_read(&zi->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = zonefs_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = zonefs_filemap_page_mkwrite, }; @@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); - init_rwsem(&zi->i_mmap_sem); zi->i_wr_refcnt = 0; return &zi->i_vnode; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 51141907097c..7b147907c328 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -70,12 +70,11 @@ struct zonefs_inode_info { * and changes to the inode private data, and in particular changes to * a sequential file size on completion of direct IO writes. * Serialization of mmap read IOs with truncate and syscall IO - * operations is done with i_mmap_sem in addition to i_truncate_mutex. - * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first, - * i_truncate_mutex second). + * operations is done with invalidate_lock in addition to + * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock + * (invalidate_lock first, i_truncate_mutex second). */ struct mutex i_truncate_mutex; - struct rw_semaphore i_mmap_sem; /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt; |