summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/afs/dir.c64
-rw-r--r--fs/afs/dir_silly.c38
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c6
-rw-r--r--fs/afs/fs_operation.c10
-rw-r--r--fs/afs/inode.c93
-rw-r--r--fs/afs/internal.h38
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/proc.c1
-rw-r--r--fs/afs/vl_alias.c5
-rw-r--r--fs/afs/write.c13
-rw-r--r--fs/afs/yfsclient.c95
-rw-r--r--fs/aio.c7
-rw-r--r--fs/binfmt_elf.c16
-rw-r--r--fs/binfmt_elf_fdpic.c31
-rw-r--r--fs/binfmt_flat.c22
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/btrfs_inode.h18
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/file.c97
-rw-r--r--fs/btrfs/inode.c379
-rw-r--r--fs/cifs/cache.c9
-rw-r--r--fs/cifs/cifs_debug.c4
-rw-r--r--fs/cifs/cifsacl.c79
-rw-r--r--fs/cifs/cifsacl.h15
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsglob.h18
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/file.c5
-rw-r--r--fs/cifs/fscache.c17
-rw-r--r--fs/cifs/fscache.h9
-rw-r--r--fs/cifs/inode.c185
-rw-r--r--fs/cifs/link.c4
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c105
-rw-r--r--fs/cifs/smb2pdu.c131
-rw-r--r--fs/cifs/smb2pdu.h27
-rw-r--r--fs/cifs/smb2proto.h6
-rw-r--r--fs/cifs/trace.h3
-rw-r--r--fs/coredump.c8
-rw-r--r--fs/direct-io.c19
-rw-r--r--fs/eventfd.c64
-rw-r--r--fs/exec.c18
-rw-r--r--fs/exfat/Kconfig7
-rw-r--r--fs/exfat/balloc.c8
-rw-r--r--fs/exfat/dir.c222
-rw-r--r--fs/exfat/exfat_fs.h48
-rw-r--r--fs/exfat/exfat_raw.h85
-rw-r--r--fs/exfat/fatent.c17
-rw-r--r--fs/exfat/file.c25
-rw-r--r--fs/exfat/inode.c57
-rw-r--r--fs/exfat/misc.c46
-rw-r--r--fs/exfat/namei.c63
-rw-r--r--fs/exfat/nls.c52
-rw-r--r--fs/exfat/super.c262
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext4/Kconfig3
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h27
-rw-r--r--fs/ext4/extents.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c26
-rw-r--r--fs/ext4/ioctl.c65
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/super.c130
-rw-r--r--fs/ext4/verity.c5
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/ext4/xattr_hurd.c51
-rw-r--r--fs/f2fs/Kconfig10
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/checkpoint.c37
-rw-r--r--fs/f2fs/compress.c182
-rw-r--r--fs/f2fs/data.c163
-rw-r--r--fs/f2fs/dir.c374
-rw-r--r--fs/f2fs/f2fs.h171
-rw-r--r--fs/f2fs/file.c401
-rw-r--r--fs/f2fs/gc.c125
-rw-r--r--fs/f2fs/gc.h2
-rw-r--r--fs/f2fs/hash.c76
-rw-r--r--fs/f2fs/inline.c49
-rw-r--r--fs/f2fs/namei.c19
-rw-r--r--fs/f2fs/node.c101
-rw-r--r--fs/f2fs/node.h5
-rw-r--r--fs/f2fs/recovery.c51
-rw-r--r--fs/f2fs/segment.c40
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c88
-rw-r--r--fs/f2fs/sysfs.c97
-rw-r--r--fs/f2fs/trace.h2
-rw-r--r--fs/f2fs/xattr.h8
-rw-r--r--fs/fuse/dev.c14
-rw-r--r--fs/fuse/dir.c12
-rw-r--r--fs/fuse/file.c120
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c26
-rw-r--r--fs/fuse/virtio_fs.c115
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/io-wq.c25
-rw-r--r--fs/io-wq.h8
-rw-r--r--fs/io_uring.c437
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/summary.h4
-rw-r--r--fs/kernfs/file.c4
-rw-r--r--fs/locks.c3
-rw-r--r--fs/namespace.c21
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/dns_resolve.c1
-rw-r--r--fs/nfs/inode.c14
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs4proc.c2
-rw-r--r--fs/nfs/nfstrace.h106
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/sysfs.h2
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs4callback.c39
-rw-r--r--fs/nfsd/nfs4proc.c9
-rw-r--r--fs/nfsd/nfs4state.c166
-rw-r--r--fs/nfsd/nfscache.c89
-rw-r--r--fs/nfsd/nfsctl.c32
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfssvc.c6
-rw-r--r--fs/nfsd/state.h7
-rw-r--r--fs/nfsd/trace.h345
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/nls/Kconfig32
-rw-r--r--fs/notify/fanotify/Kconfig4
-rw-r--r--fs/notify/inotify/Kconfig2
-rw-r--r--fs/ocfs2/Kconfig2
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/overlayfs/copy_up.c9
-rw-r--r--fs/overlayfs/dir.c51
-rw-r--r--fs/overlayfs/export.c24
-rw-r--r--fs/overlayfs/file.c28
-rw-r--r--fs/overlayfs/inode.c17
-rw-r--r--fs/overlayfs/namei.c138
-rw-r--r--fs/overlayfs/overlayfs.h11
-rw-r--r--fs/overlayfs/ovl_entry.h10
-rw-r--r--fs/overlayfs/readdir.c57
-rw-r--r--fs/overlayfs/super.c243
-rw-r--r--fs/overlayfs/util.c36
-rw-r--r--fs/pipe.c242
-rw-r--r--fs/proc/Kconfig2
-rw-r--r--fs/proc/array.c1
-rw-r--r--fs/proc/base.c24
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/meminfo.c1
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/root.c10
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c34
-rw-r--r--fs/proc/task_nommu.c18
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/romfs/Kconfig2
-rw-r--r--fs/select.c112
-rw-r--r--fs/splice.c12
-rw-r--r--fs/squashfs/squashfs_fs.h16
-rw-r--r--fs/super.c2
-rw-r--r--fs/sync.c3
-rw-r--r--fs/userfaultfd.c46
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_inode.c18
-rw-r--r--fs/xfs/xfs_ioctl.c108
-rw-r--r--fs/xfs/xfs_iops.c4
174 files changed, 5289 insertions, 2620 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index d1ad3935fb85..a88aa3af73c1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -229,7 +229,7 @@ endmenu
menuconfig MISC_FILESYSTEMS
bool "Miscellaneous filesystems"
default y
- ---help---
+ help
Say Y here to get to see options for various miscellaneous
filesystems, such as filesystems that came from other
operating systems.
@@ -274,7 +274,7 @@ menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
depends on NET
- ---help---
+ help
Say Y here to get to see options for network filesystems and
filesystem-related networking code, such as NFS daemon and
RPCSEC security modules.
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 8cd471da3255..885da6d983b4 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -7,7 +7,7 @@ config BINFMT_ELF
depends on MMU
select ELFCORE
default y
- ---help---
+ help
ELF (Executable and Linkable Format) is a format for libraries and
executables used across different architectures and operating
systems. Saying Y here will enable your kernel to run ELF binaries
@@ -138,7 +138,7 @@ config HAVE_AOUT
config BINFMT_AOUT
tristate "Kernel support for a.out and ECOFF binaries"
depends on HAVE_AOUT
- ---help---
+ help
A.out (Assembler.OUTput) is a set of formats for libraries and
executables used in the earliest versions of UNIX. Linux used
the a.out formats QMAGIC and ZMAGIC until they were replaced
@@ -168,7 +168,7 @@ config OSF4_COMPAT
config BINFMT_EM86
tristate "Kernel support for Linux/Intel ELF binaries"
depends on ALPHA
- ---help---
+ help
Say Y here if you want to be able to execute Linux/Intel ELF
binaries just like native Alpha binaries on your Alpha machine. For
this to work, you need to have the emulator /usr/bin/em86 in place.
@@ -182,7 +182,7 @@ config BINFMT_EM86
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
- ---help---
+ help
If you say Y here, it will be possible to plug wrapper-driven binary
formats into the kernel. You will like this especially when you use
programs that need an interpreter to run like Java, Python, .NET or
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 25cbe0aeeec5..3e3c2bf0a722 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -648,7 +648,7 @@ static void afs_do_lookup_success(struct afs_operation *op)
vp = &op->file[0];
abort_code = vp->scb.status.abort_code;
if (abort_code != 0) {
- op->abort_code = abort_code;
+ op->ac.abort_code = abort_code;
op->error = afs_abort_to_error(abort_code);
}
break;
@@ -696,10 +696,11 @@ static const struct afs_operation_ops afs_inline_bulk_status_operation = {
.success = afs_do_lookup_success,
};
-static const struct afs_operation_ops afs_fetch_status_operation = {
+static const struct afs_operation_ops afs_lookup_fetch_status_operation = {
.issue_afs_rpc = afs_fs_fetch_status,
.issue_yfs_rpc = yfs_fs_fetch_status,
.success = afs_do_lookup_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
@@ -980,7 +981,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
if (!IS_ERR_OR_NULL(inode))
fid = AFS_FS_I(inode)->fid;
- _debug("splice %px", dentry->d_inode);
+ _debug("splice %p", dentry->d_inode);
d = d_splice_alias(inode, dentry);
if (!IS_ERR_OR_NULL(d)) {
d->d_fsdata = dentry->d_fsdata;
@@ -1236,6 +1237,17 @@ void afs_d_release(struct dentry *dentry)
_enter("%pd", dentry);
}
+void afs_check_for_remote_deletion(struct afs_operation *op)
+{
+ struct afs_vnode *vnode = op->file[0].vnode;
+
+ switch (op->ac.abort_code) {
+ case VNOVNODE:
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ afs_break_callback(vnode, afs_cb_break_for_deleted);
+ }
+}
+
/*
* Create a new inode for create/mkdir/symlink
*/
@@ -1268,7 +1280,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
static void afs_create_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
afs_vnode_new_inode(op);
@@ -1302,6 +1314,7 @@ static const struct afs_operation_ops afs_mkdir_operation = {
.issue_afs_rpc = afs_fs_make_dir,
.issue_yfs_rpc = yfs_fs_make_dir,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1325,6 +1338,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFDIR | mode;
op->create.reason = afs_edit_dir_for_mkdir;
@@ -1350,7 +1364,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
static void afs_rmdir_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
}
@@ -1382,6 +1396,7 @@ static const struct afs_operation_ops afs_rmdir_operation = {
.issue_afs_rpc = afs_fs_remove_dir,
.issue_yfs_rpc = yfs_fs_remove_dir,
.success = afs_rmdir_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_rmdir_edit_dir,
.put = afs_rmdir_put,
};
@@ -1404,6 +1419,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->ops = &afs_rmdir_operation;
@@ -1479,7 +1495,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
static void afs_unlink_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[1]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1511,6 +1528,7 @@ static const struct afs_operation_ops afs_unlink_operation = {
.issue_afs_rpc = afs_fs_remove_file,
.issue_yfs_rpc = yfs_fs_remove_file,
.success = afs_unlink_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_unlink_edit_dir,
.put = afs_unlink_put,
};
@@ -1537,6 +1555,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
/* Try to make sure we have a callback promise on the victim. */
ret = afs_validate(vnode, op->key);
@@ -1561,9 +1580,25 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
op->file[1].vnode = vnode;
+ op->file[1].update_ctime = true;
+ op->file[1].op_unlinked = true;
op->dentry = dentry;
op->ops = &afs_unlink_operation;
- return afs_do_sync_operation(op);
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+ if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+ return afs_put_operation(op);
error:
return afs_put_operation(op);
@@ -1573,6 +1608,7 @@ static const struct afs_operation_ops afs_create_operation = {
.issue_afs_rpc = afs_fs_create_file,
.issue_yfs_rpc = yfs_fs_create_file,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1601,6 +1637,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFREG | mode;
@@ -1620,6 +1657,7 @@ static void afs_link_success(struct afs_operation *op)
struct afs_vnode_param *vp = &op->file[1];
_enter("op=%08x", op->debug_id);
+ op->ctime = dvp->scb.status.mtime_client;
afs_vnode_commit_status(op, dvp);
afs_vnode_commit_status(op, vp);
afs_update_dentry_version(op, dvp, op->dentry);
@@ -1640,6 +1678,7 @@ static const struct afs_operation_ops afs_link_operation = {
.issue_afs_rpc = afs_fs_link,
.issue_yfs_rpc = yfs_fs_link,
.success = afs_link_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_link_put,
};
@@ -1672,6 +1711,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = dentry;
op->dentry_2 = from;
@@ -1689,6 +1730,7 @@ static const struct afs_operation_ops afs_symlink_operation = {
.issue_afs_rpc = afs_fs_symlink,
.issue_yfs_rpc = yfs_fs_symlink,
.success = afs_create_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
.put = afs_create_put,
};
@@ -1740,9 +1782,13 @@ static void afs_rename_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_check_dir_conflict(op, &op->file[1]);
afs_vnode_commit_status(op, &op->file[0]);
- if (op->file[1].vnode != op->file[0].vnode)
+ if (op->file[1].vnode != op->file[0].vnode) {
+ op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
+ }
}
static void afs_rename_edit_dir(struct afs_operation *op)
@@ -1860,6 +1906,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index b14e3d9a25e2..04f75a44f243 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -16,6 +16,7 @@ static void afs_silly_rename_success(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
}
@@ -69,6 +70,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
return PTR_ERR(op);
afs_op_set_vnode(op, 0, dvnode);
+ afs_op_set_vnode(op, 1, dvnode);
+ op->file[0].dv_delta = 1;
+ op->file[1].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].update_ctime = true;
op->dentry = old;
op->dentry_2 = new;
@@ -129,6 +135,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
switch (ret) {
case 0:
/* The rename succeeded. */
+ set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags);
d_move(dentry, sdentry);
break;
case -ERESTARTSYS:
@@ -148,19 +155,11 @@ out:
static void afs_silly_unlink_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = op->file[1].vnode;
-
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, op->file[0].vnode);
+ afs_check_dir_conflict(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[0]);
afs_vnode_commit_status(op, &op->file[1]);
afs_update_dentry_version(op, &op->file[0], op->dentry);
-
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
- }
}
static void afs_silly_unlink_edit_dir(struct afs_operation *op)
@@ -181,6 +180,7 @@ static const struct afs_operation_ops afs_silly_unlink_operation = {
.issue_afs_rpc = afs_fs_remove_file,
.issue_yfs_rpc = yfs_fs_remove_file,
.success = afs_silly_unlink_success,
+ .aborted = afs_check_for_remote_deletion,
.edit_dir = afs_silly_unlink_edit_dir,
};
@@ -200,12 +200,30 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
+ op->file[0].dv_delta = 1;
+ op->file[0].update_ctime = true;
+ op->file[1].op_unlinked = true;
+ op->file[1].update_ctime = true;
op->dentry = dentry;
op->ops = &afs_silly_unlink_operation;
trace_afs_silly_rename(vnode, true);
- return afs_do_sync_operation(op);
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+ if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+ return afs_put_operation(op);
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 506c47471b42..6f6ed1605cfe 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -225,7 +225,6 @@ static void afs_fetch_data_success(struct afs_operation *op)
struct afs_vnode *vnode = op->file[0].vnode;
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, vnode);
afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
@@ -240,6 +239,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
+ .aborted = afs_check_for_remote_deletion,
.put = afs_fetch_data_put,
};
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 70e518f7bc19..ffb8575345ca 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -71,7 +71,7 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode)
void afs_lock_op_done(struct afs_call *call)
{
struct afs_operation *op = call->op;
- struct afs_vnode *vnode = op->lock.lvnode;
+ struct afs_vnode *vnode = op->file[0].vnode;
if (call->error == 0) {
spin_lock(&vnode->lock);
@@ -175,10 +175,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
static void afs_lock_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = op->file[0].vnode;
-
_enter("op=%08x", op->debug_id);
- afs_check_for_remote_deletion(op, vnode);
afs_vnode_commit_status(op, &op->file[0]);
}
@@ -186,6 +183,7 @@ static const struct afs_operation_ops afs_set_lock_operation = {
.issue_afs_rpc = afs_fs_set_lock,
.issue_yfs_rpc = yfs_fs_set_lock,
.success = afs_lock_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 2d2dff5688a4..c264839b2fd0 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -187,9 +187,17 @@ void afs_wait_for_operation(struct afs_operation *op)
op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
}
- if (op->error == 0) {
+ switch (op->error) {
+ case 0:
_debug("success");
op->ops->success(op);
+ break;
+ case -ECONNABORTED:
+ if (op->ops->aborted)
+ op->ops->aborted(op);
+ break;
+ default:
+ break;
}
afs_end_vnode_operation(op);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 7dde703df40c..1d13d2e882ad 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -165,9 +165,11 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
+ struct inode *inode = &vnode->vfs_inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
+ bool change_size = vp->set_size;
_enter("{%llx:%llu.%u} %s",
vp->fid.vid, vp->fid.vnode, vp->fid.unique,
@@ -186,25 +188,25 @@ static void afs_apply_status(struct afs_operation *op,
}
if (status->nlink != vnode->status.nlink)
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(inode, status->nlink);
if (status->owner != vnode->status.owner)
- vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+ inode->i_uid = make_kuid(&init_user_ns, status->owner);
if (status->group != vnode->status.group)
- vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+ inode->i_gid = make_kgid(&init_user_ns, status->group);
if (status->mode != vnode->status.mode) {
- mode = vnode->vfs_inode.i_mode;
+ mode = inode->i_mode;
mode &= ~S_IALLUGO;
mode |= status->mode;
- WRITE_ONCE(vnode->vfs_inode.i_mode, mode);
+ WRITE_ONCE(inode->i_mode, mode);
}
t = status->mtime_client;
- vnode->vfs_inode.i_ctime = t;
- vnode->vfs_inode.i_mtime = t;
- vnode->vfs_inode.i_atime = t;
+ inode->i_mtime = t;
+ if (vp->update_ctime)
+ inode->i_ctime = op->ctime;
if (vnode->status.data_version != status->data_version)
data_changed = true;
@@ -226,6 +228,7 @@ static void afs_apply_status(struct afs_operation *op,
} else {
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
+ change_size = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -233,11 +236,22 @@ static void afs_apply_status(struct afs_operation *op,
*/
if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
data_changed = false;
+ change_size = true;
}
if (data_changed) {
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
- afs_set_i_size(vnode, status->size);
+ inode_set_iversion_raw(inode, status->data_version);
+
+ /* Only update the size if the data version jumped. If the
+ * file is being modified locally, then we might have our own
+ * idea of what the size should be that's not the same as
+ * what's on the server.
+ */
+ if (change_size) {
+ afs_set_i_size(vnode, status->size);
+ inode->i_ctime = t;
+ inode->i_atime = t;
+ }
}
}
@@ -267,32 +281,39 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
_enter("");
- ASSERTCMP(op->error, ==, 0);
-
write_seqlock(&vnode->cb_lock);
if (vp->scb.have_error) {
+ /* A YFS server will return this from RemoveFile2 and AFS and
+ * YFS will return this from InlineBulkStatus.
+ */
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_nlink(&vnode->vfs_inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
+ op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
- } else {
- if (vp->scb.have_status)
- afs_apply_status(op, vp);
+ } else if (vp->scb.have_status) {
+ afs_apply_status(op, vp);
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
+ } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ drop_nlink(&vnode->vfs_inode);
+ if (vnode->vfs_inode.i_nlink == 0) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ __afs_break_callback(vnode, afs_cb_break_for_deleted);
+ }
}
write_sequnlock(&vnode->cb_lock);
- if (op->error == 0 && vp->scb.have_status)
+ if (vp->scb.have_status)
afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
}
static void afs_fetch_status_success(struct afs_operation *op)
{
- struct afs_vnode_param *vp = &op->file[0];
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
struct afs_vnode *vnode = vp->vnode;
int ret;
@@ -306,10 +327,11 @@ static void afs_fetch_status_success(struct afs_operation *op)
}
}
-static const struct afs_operation_ops afs_fetch_status_operation = {
+const struct afs_operation_ops afs_fetch_status_operation = {
.issue_afs_rpc = afs_fs_fetch_status,
.issue_yfs_rpc = yfs_fs_fetch_status,
.success = afs_fetch_status_success,
+ .aborted = afs_check_for_remote_deletion,
};
/*
@@ -538,7 +560,7 @@ error:
* mark the data attached to an inode as obsolete due to a write on the server
* - might also want to ditch all the outstanding writes and dirty pages
*/
-void afs_zap_data(struct afs_vnode *vnode)
+static void afs_zap_data(struct afs_vnode *vnode)
{
_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
@@ -716,6 +738,9 @@ int afs_getattr(const struct path *path, struct kstat *stat,
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(inode, stat);
+ if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
+ stat->nlink > 0)
+ stat->nlink -= 1;
} while (need_seqretry(&vnode->cb_lock, seq));
done_seqretry(&vnode->cb_lock, seq);
@@ -785,7 +810,15 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
+ struct inode *inode = &op->file[0].vnode->vfs_inode;
+
afs_vnode_commit_status(op, &op->file[0]);
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
+ if (size > i_size)
+ pagecache_isize_extended(inode, i_size, size);
+ truncate_pagecache(inode, size);
+ }
}
static const struct afs_operation_ops afs_setattr_operation = {
@@ -801,17 +834,31 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+ int ret;
_enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
- ATTR_MTIME))) {
+ ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET |
+ ATTR_TOUCH))) {
_leave(" = 0 [unsupported]");
return 0;
}
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (!S_ISREG(vnode->vfs_inode.i_mode))
+ return -EISDIR;
+
+ ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size);
+ if (ret)
+ return ret;
+
+ if (attr->ia_size == i_size_read(&vnode->vfs_inode))
+ attr->ia_valid &= ~ATTR_SIZE;
+ }
+
/* flush any dirty data outstanding on a regular file */
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
@@ -825,8 +872,12 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
- if (attr->ia_valid & ATTR_SIZE)
+ if (attr->ia_valid & ATTR_SIZE) {
op->file[0].dv_delta = 1;
+ op->file[0].set_size = true;
+ }
+ op->ctime = attr->ia_ctime;
+ op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
return afs_do_sync_operation(op);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index e1621b0670cc..573a5922c3bb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -634,6 +634,7 @@ struct afs_vnode {
#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
+#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
@@ -744,8 +745,11 @@ struct afs_vnode_param {
afs_dataversion_t dv_before; /* Data version before the call */
unsigned int cb_break_before; /* cb_break + cb_s_break before the call */
u8 dv_delta; /* Expected change in data version */
- bool put_vnode; /* T if we have a ref on the vnode */
- bool need_io_lock; /* T if we need the I/O lock on this */
+ bool put_vnode:1; /* T if we have a ref on the vnode */
+ bool need_io_lock:1; /* T if we need the I/O lock on this */
+ bool update_ctime:1; /* Need to update the ctime */
+ bool set_size:1; /* Must update i_size */
+ bool op_unlinked:1; /* True if file was unlinked by op */
};
/*
@@ -766,9 +770,9 @@ struct afs_operation {
struct dentry *dentry; /* Dentry to be altered */
struct dentry *dentry_2; /* Second dentry to be altered */
struct timespec64 mtime; /* Modification time to record */
+ struct timespec64 ctime; /* Change time to set */
short nr_files; /* Number of entries in file[], more_files */
short error;
- unsigned int abort_code;
unsigned int debug_id;
unsigned int cb_v_break; /* Volume break counter before op */
@@ -795,7 +799,6 @@ struct afs_operation {
struct afs_read *req;
} fetch;
struct {
- struct afs_vnode *lvnode; /* vnode being locked */
afs_lock_type_t type;
} lock;
struct {
@@ -838,6 +841,7 @@ struct afs_operation {
#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */
#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */
#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */
+#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */
};
/*
@@ -933,6 +937,7 @@ extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations;
extern void afs_d_release(struct dentry *);
+extern void afs_check_for_remote_deletion(struct afs_operation *);
/*
* dir_edit.c
@@ -1064,13 +1069,14 @@ extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
/*
* inode.c
*/
+extern const struct afs_operation_ops afs_fetch_status_operation;
+
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
-extern void afs_zap_data(struct afs_vnode *);
extern bool afs_check_validity(struct afs_vnode *);
extern int afs_validate(struct afs_vnode *, struct key *);
extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -1437,7 +1443,6 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
/*
* yfsclient.c
*/
-extern void yfs_fs_fetch_file_status(struct afs_operation *);
extern void yfs_fs_fetch_data(struct afs_operation *);
extern void yfs_fs_create_file(struct afs_operation *);
extern void yfs_fs_make_dir(struct afs_operation *);
@@ -1483,15 +1488,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
return &vnode->vfs_inode;
}
-static inline void afs_check_for_remote_deletion(struct afs_operation *op,
- struct afs_vnode *vnode)
-{
- if (op->error == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- afs_break_callback(vnode, afs_cb_break_for_deleted);
- }
-}
-
/*
* Note that a dentry got changed. We need to set d_fsdata to the data version
* number derived from the result of the operation. It doesn't matter if
@@ -1506,6 +1502,18 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
(void *)(unsigned long)dir_vp->scb.status.data_version;
}
+/*
+ * Check for a conflicting operation on a directory that we just unlinked from.
+ * If someone managed to sneak a link or an unlink in on the file we just
+ * unlinked, we won't be able to trust nlink on an AFS file (but not YFS).
+ */
+static inline void afs_check_dir_conflict(struct afs_operation *op,
+ struct afs_vnode_param *dvp)
+{
+ if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version)
+ op->flags |= AFS_OPERATION_DIR_CONFLICT;
+}
+
static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
{
trace_afs_io_error(call->debug_id, -EIO, where);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 52b19e9c1535..5334f1bd2bca 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -83,6 +83,7 @@ int afs_abort_to_error(u32 abort_code)
case UAENOLCK: return -ENOLCK;
case UAENOTEMPTY: return -ENOTEMPTY;
case UAELOOP: return -ELOOP;
+ case UAEOVERFLOW: return -EOVERFLOW;
case UAENOMEDIUM: return -ENOMEDIUM;
case UAEDQUOT: return -EDQUOT;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 22d00cf1913d..e817fc740ba0 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -567,6 +567,7 @@ void afs_put_sysnames(struct afs_sysnames *sysnames)
if (sysnames->subs[i] != afs_init_sysname &&
sysnames->subs[i] != sysnames->blank)
kfree(sysnames->subs[i]);
+ kfree(sysnames);
}
}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 093895c49c21..5082ef04e99c 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -28,7 +28,7 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
};
volume = afs_create_volume(&fc);
- _leave(" = %px", volume);
+ _leave(" = %p", volume);
return volume;
}
@@ -73,7 +73,8 @@ static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
}
default:
- BUG();
+ WARN_ON(1);
+ diff = 1;
}
out:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 97bccde3298b..7437806332d9 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -194,11 +194,11 @@ int afs_write_end(struct file *file, struct address_space *mapping,
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size) {
- spin_lock(&vnode->wb_lock);
+ write_seqlock(&vnode->cb_lock);
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size)
i_size_write(&vnode->vfs_inode, maybe_i_size);
- spin_unlock(&vnode->wb_lock);
+ write_sequnlock(&vnode->cb_lock);
}
if (!PageUptodate(page)) {
@@ -393,6 +393,7 @@ static void afs_store_data_success(struct afs_operation *op)
{
struct afs_vnode *vnode = op->file[0].vnode;
+ op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
if (op->error == 0) {
afs_pages_written_back(vnode, op->store.first, op->store.last);
@@ -447,6 +448,7 @@ static int afs_store_data(struct address_space *mapping,
op->store.last = last;
op->store.first_offset = offset;
op->store.last_to = to;
+ op->mtime = vnode->vfs_inode.i_mtime;
op->ops = &afs_store_data_operation;
try_next_key:
@@ -490,6 +492,7 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
unsigned long count, priv;
unsigned n, offset, to, f, t;
pgoff_t start, first, last;
+ loff_t i_size, end;
int loop, ret;
_enter(",%lx", primary_page->index);
@@ -590,7 +593,12 @@ no_more:
first = primary_page->index;
last = first + count - 1;
+ end = (loff_t)last * PAGE_SIZE + to;
+ i_size = i_size_read(&vnode->vfs_inode);
+
_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+ if (end > i_size)
+ to = i_size & ~PAGE_MASK;
ret = afs_store_data(mapping, first, last, offset, to);
switch (ret) {
@@ -843,6 +851,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
vmf->page->index, priv);
SetPagePrivate(vmf->page);
set_page_private(vmf->page, priv);
+ file_update_time(file);
sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index b0a6e40b4da3..8c24fdc899e3 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -15,8 +15,6 @@
#include "xdr_fs.h"
#include "protocol_yfs.h"
-static const struct afs_fid afs_zero_fid;
-
#define xdr_size(x) (sizeof(*x) / sizeof(__be32))
static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid)
@@ -332,29 +330,6 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
}
/*
- * Deliver a reply that's a status, callback and volsync.
- */
-static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
-{
- struct afs_operation *op = call->op;
- const __be32 *bp;
- int ret;
-
- ret = afs_transfer_reply(call);
- if (ret < 0)
- return ret;
-
- /* unmarshall the reply once we've received all of it */
- bp = call->buffer;
- xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb);
- xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb);
- xdr_decode_YFSVolSync(&bp, &op->volsync);
-
- _leave(" = 0 [done]");
- return 0;
-}
-
-/*
* Deliver reply data to operations that just return a file status and a volume
* sync record.
*/
@@ -377,48 +352,6 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
}
/*
- * YFS.FetchStatus operation type
- */
-static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
- .name = "YFS.FetchStatus(vnode)",
- .op = yfs_FS_FetchStatus,
- .deliver = yfs_deliver_fs_status_cb_and_volsync,
- .destructor = afs_flat_call_destructor,
-};
-
-/*
- * Fetch the status information for a file.
- */
-void yfs_fs_fetch_file_status(struct afs_operation *op)
-{
- struct afs_vnode_param *vp = &op->file[0];
- struct afs_call *call;
- __be32 *bp;
-
- _enter(",%x,{%llx:%llu},,",
- key_serial(op->key), vp->fid.vid, vp->fid.vnode);
-
- call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode,
- sizeof(__be32) * 2 +
- sizeof(struct yfs_xdr_YFSFid),
- sizeof(struct yfs_xdr_YFSFetchStatus) +
- sizeof(struct yfs_xdr_YFSCallBack) +
- sizeof(struct yfs_xdr_YFSVolSync));
- if (!call)
- return afs_op_nomem(op);
-
- /* marshall the parameters */
- bp = call->request;
- bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
- bp = xdr_encode_u32(bp, 0); /* RPC flags */
- bp = xdr_encode_YFSFid(bp, &vp->fid);
- yfs_check_req(call, bp);
-
- trace_afs_make_fs_call(call, &vp->fid);
- afs_make_op_call(op, call, GFP_NOFS);
-}
-
-/*
* Deliver reply data to an YFS.FetchData64.
*/
static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
@@ -1607,12 +1540,36 @@ void yfs_fs_release_lock(struct afs_operation *op)
}
/*
+ * Deliver a reply to YFS.FetchStatus
+ */
+static int yfs_deliver_fs_fetch_status(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_YFSFetchStatus(&bp, call, &vp->scb);
+ xdr_decode_YFSCallBack(&bp, call, &vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
* YFS.FetchStatus operation type
*/
static const struct afs_call_type yfs_RXYFSFetchStatus = {
.name = "YFS.FetchStatus",
.op = yfs_FS_FetchStatus,
- .deliver = yfs_deliver_fs_status_cb_and_volsync,
+ .deliver = yfs_deliver_fs_fetch_status,
.destructor = afs_flat_call_destructor,
};
@@ -1621,7 +1578,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = {
*/
void yfs_fs_fetch_status(struct afs_operation *op)
{
- struct afs_vnode_param *vp = &op->file[0];
+ struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
struct afs_call *call;
__be32 *bp;
diff --git a/fs/aio.c b/fs/aio.c
index 6483f9274d5e..91e7cc4a9f17 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -27,7 +27,6 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
@@ -68,7 +67,7 @@ struct aio_ring {
unsigned header_length; /* size of aio_ring */
- struct io_event io_events[0];
+ struct io_event io_events[];
}; /* 128 bytes + ring size */
/*
@@ -520,7 +519,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->mmap_size = nr_pages * PAGE_SIZE;
pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
- if (down_write_killable(&mm->mmap_sem)) {
+ if (mmap_write_lock_killable(mm)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EINTR;
@@ -529,7 +528,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
MAP_SHARED, 0, &unused, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5d50bd880e9..9fe3b51c116a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -208,7 +208,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
size_t len = strlen(k_platform) + 1;
u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
- if (__copy_to_user(u_platform, k_platform, len))
+ if (copy_to_user(u_platform, k_platform, len))
return -EFAULT;
}
@@ -221,7 +221,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
size_t len = strlen(k_base_platform) + 1;
u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
- if (__copy_to_user(u_base_platform, k_base_platform, len))
+ if (copy_to_user(u_base_platform, k_base_platform, len))
return -EFAULT;
}
@@ -231,7 +231,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
- if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+ if (copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
return -EFAULT;
/* Create the ELF interpreter info */
@@ -314,21 +314,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
return -EFAULT;
/* Now, let's put argc (and argv, envp if appropriate) on the stack */
- if (__put_user(argc, sp++))
+ if (put_user(argc, sp++))
return -EFAULT;
/* Populate list of argv pointers back to argv strings. */
p = mm->arg_end = mm->arg_start;
while (argc-- > 0) {
size_t len;
- if (__put_user((elf_addr_t)p, sp++))
+ if (put_user((elf_addr_t)p, sp++))
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- if (__put_user(0, sp++))
+ if (put_user(0, sp++))
return -EFAULT;
mm->arg_end = p;
@@ -336,14 +336,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
mm->env_end = mm->env_start = p;
while (envc-- > 0) {
size_t len;
- if (__put_user((elf_addr_t)p, sp++))
+ if (put_user((elf_addr_t)p, sp++))
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- if (__put_user(0, sp++))
+ if (put_user(0, sp++))
return -EFAULT;
mm->env_end = p;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aaf332d32326..0f45521b237c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -536,7 +536,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
platform_len = strlen(k_platform) + 1;
sp -= platform_len;
u_platform = (char __user *) sp;
- if (__copy_to_user(u_platform, k_platform, platform_len) != 0)
+ if (copy_to_user(u_platform, k_platform, platform_len) != 0)
return -EFAULT;
}
@@ -551,7 +551,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
platform_len = strlen(k_base_platform) + 1;
sp -= platform_len;
u_base_platform = (char __user *) sp;
- if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
+ if (copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
return -EFAULT;
}
@@ -603,11 +603,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
/* put the ELF interpreter info on the stack */
#define NEW_AUX_ENT(id, val) \
do { \
- struct { unsigned long _id, _val; } __user *ent; \
+ struct { unsigned long _id, _val; } __user *ent, v; \
\
ent = (void __user *) csp; \
- __put_user((id), &ent[nr]._id); \
- __put_user((val), &ent[nr]._val); \
+ v._id = (id); \
+ v._val = (val); \
+ if (copy_to_user(ent + nr, &v, sizeof(v))) \
+ return -EFAULT; \
nr++; \
} while (0)
@@ -674,7 +676,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
/* stack argc */
csp -= sizeof(unsigned long);
- __put_user(bprm->argc, (unsigned long __user *) csp);
+ if (put_user(bprm->argc, (unsigned long __user *) csp))
+ return -EFAULT;
BUG_ON(csp != sp);
@@ -688,25 +691,29 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
p = (char __user *) current->mm->arg_start;
for (loop = bprm->argc; loop > 0; loop--) {
- __put_user((elf_caddr_t) p, argv++);
+ if (put_user((elf_caddr_t) p, argv++))
+ return -EFAULT;
len = strnlen_user(p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- __put_user(NULL, argv);
+ if (put_user(NULL, argv))
+ return -EFAULT;
current->mm->arg_end = (unsigned long) p;
/* fill in the envv[] array */
current->mm->env_start = (unsigned long) p;
for (loop = bprm->envc; loop > 0; loop--) {
- __put_user((elf_caddr_t)(unsigned long) p, envp++);
+ if (put_user((elf_caddr_t)(unsigned long) p, envp++))
+ return -EFAULT;
len = strnlen_user(p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- __put_user(NULL, envp);
+ if (put_user(NULL, envp))
+ return -EFAULT;
current->mm->env_end = (unsigned long) p;
mm->start_stack = (unsigned long) sp;
@@ -848,8 +855,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
tmp = phdr->p_memsz / sizeof(Elf32_Dyn);
dyn = (Elf32_Dyn __user *)params->dynamic_addr;
- __get_user(d_tag, &dyn[tmp - 1].d_tag);
- if (d_tag != 0)
+ if (get_user(d_tag, &dyn[tmp - 1].d_tag) ||
+ d_tag != 0)
goto dynamic_error;
break;
}
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 87ce229c63cf..f2f9086ebe98 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -138,35 +138,40 @@ static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start
current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN;
sp = (unsigned long __user *)current->mm->start_stack;
- __put_user(bprm->argc, sp++);
+ if (put_user(bprm->argc, sp++))
+ return -EFAULT;
if (IS_ENABLED(CONFIG_BINFMT_FLAT_ARGVP_ENVP_ON_STACK)) {
unsigned long argv, envp;
argv = (unsigned long)(sp + 2);
envp = (unsigned long)(sp + 2 + bprm->argc + 1);
- __put_user(argv, sp++);
- __put_user(envp, sp++);
+ if (put_user(argv, sp++) || put_user(envp, sp++))
+ return -EFAULT;
}
current->mm->arg_start = (unsigned long)p;
for (i = bprm->argc; i > 0; i--) {
- __put_user((unsigned long)p, sp++);
+ if (put_user((unsigned long)p, sp++))
+ return -EFAULT;
len = strnlen_user(p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- __put_user(0, sp++);
+ if (put_user(0, sp++))
+ return -EFAULT;
current->mm->arg_end = (unsigned long)p;
current->mm->env_start = (unsigned long) p;
for (i = bprm->envc; i > 0; i--) {
- __put_user((unsigned long)p, sp++);
+ if (put_user((unsigned long)p, sp++))
+ return -EFAULT;
len = strnlen_user(p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- __put_user(0, sp++);
+ if (put_user(0, sp++))
+ return -EFAULT;
current->mm->env_end = (unsigned long)p;
return 0;
@@ -996,7 +1001,8 @@ static int load_flat_binary(struct linux_binprm *bprm)
unsigned long __user *sp;
current->mm->start_stack -= sizeof(unsigned long);
sp = (unsigned long __user *)current->mm->start_stack;
- __put_user(start_addr, sp);
+ if (put_user(start_addr, sp))
+ return -EFAULT;
start_addr = libinfo.lib_list[i].entry;
}
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 68b95ad82126..575636f6491e 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,7 +14,6 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
- select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index aeff56a0e105..e7d709505cb1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -28,6 +28,7 @@ enum {
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
+ BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -312,6 +313,23 @@ struct btrfs_dio_private {
u8 csums[];
};
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+ smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
+{
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 161533040978..30ce7039bc27 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,7 +28,6 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
-#include <linux/iomap.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
@@ -2934,9 +2933,6 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dops;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fde125616687..2c14312b05e8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1809,61 +1809,21 @@ again:
return num_written ? num_written : ret;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- const unsigned int blocksize_mask = fs_info->sectorsize - 1;
-
- if (offset & blocksize_mask)
- return -EINVAL;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- return -EINVAL;
-
- return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
+ loff_t pos;
+ ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- size_t count = 0;
- bool relock = false;
- if (check_direct_IO(fs_info, from, pos))
- goto buffered;
-
- count = iov_iter_count(from);
- /*
- * If the write DIO is beyond the EOF, we need update the isize, but it
- * is protected by i_mutex. So we can not unlock the i_mutex at this
- * case.
- */
- if (pos + count <= inode->i_size) {
- inode_unlock(inode);
- relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- return -EAGAIN;
- }
-
- down_read(&BTRFS_I(inode)->dio_sem);
- written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dops,
- is_sync_kiocb(iocb));
- up_read(&BTRFS_I(inode)->dio_sem);
-
- if (relock)
- inode_lock(inode);
+ written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
-buffered:
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -2002,7 +1962,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = btrfs_direct_write(iocb, from);
+ num_written = __btrfs_direct_write(iocb, from);
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -3516,54 +3476,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
-static int check_direct_read(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int ret;
- int i, seg;
-
- ret = check_direct_IO(fs_info, iter, offset);
- if (ret < 0)
- return ret;
-
- for (seg = 0; seg < iter->nr_segs; seg++)
- for (i = seg + 1; i < iter->nr_segs; i++)
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- return -EINVAL;
- return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
-
- if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
- return 0;
-
- inode_lock_shared(inode);
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops,
- is_sync_kiocb(iocb));
- inode_unlock_shared(inode);
- return ret;
-}
-
-static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
- ssize_t ret = 0;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = btrfs_direct_read(iocb, to);
- if (ret < 0)
- return ret;
- }
-
- return generic_file_buffered_read(iocb, to, ret);
-}
-
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = btrfs_file_read_iter,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 31ac8c682f19..d04c82c88418 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -57,9 +58,9 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- loff_t length;
- ssize_t submitted;
- struct extent_changeset *data_reserved;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+ int overwrite;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -4810,7 +4811,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
truncate_setsize(inode, newsize);
+ /* Disable nonlocked read DIO to avoid the endless truncate */
+ btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -7041,7 +7045,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, bool writing)
+ struct extent_state **cached_state, int writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7179,7 +7183,30 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+ struct buffer_head *bh_result,
+ struct inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return -ENOENT;
+
+ len = min(len, em->len - (start - em->start));
+
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+ set_buffer_mapped(bh_result);
+
+ return 0;
+}
+
static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7241,6 +7268,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
+ len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(inode, start, len);
if (IS_ERR(em)) {
@@ -7251,73 +7279,64 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+ set_buffer_mapped(bh_result);
+
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (start + len > i_size_read(inode))
+ if (!dio_data->overwrite && start + len > i_size_read(inode))
i_size_write(inode, start + len);
+ WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
+ current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
- loff_t length, unsigned flags, struct iomap *iomap,
- struct iomap *srcmap)
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
+ u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- const bool write = !!(flags & IOMAP_WRITE);
+ u64 len = bh_result->b_size;
int ret = 0;
- u64 len = length;
- bool unlock_extents = false;
- if (!write)
+ if (!create)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so we
- * need to flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
-
- dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
- if (!dio_data)
- return -ENOMEM;
-
- dio_data->length = length;
- if (write) {
- dio_data->reserve = round_up(length, fs_info->sectorsize);
- ret = btrfs_delalloc_reserve_space(inode,
- &dio_data->data_reserved,
- start, dio_data->reserve);
- if (ret) {
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- return ret;
- }
+ if (current->journal_info) {
+ /*
+ * Need to pull our outstanding extents and set journal_info to NULL so
+ * that anything that needs to check if there's a transaction doesn't get
+ * confused.
+ */
+ dio_data = current->journal_info;
+ current->journal_info = NULL;
}
- iomap->private = dio_data;
-
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
ret = -ENOTBLK;
goto err;
}
@@ -7349,47 +7368,35 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
goto unlock_err;
}
- len = min(len, em->len - (start - em->start));
- if (write) {
- ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, len);
+ if (create) {
+ ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+ dio_data, start, len);
if (ret < 0)
goto unlock_err;
- unlock_extents = true;
- /* Recalc len in case the new em is smaller than requested */
- len = min(len, em->len - (start - em->start));
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
} else {
+ ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+ start, len);
+ /* Can be negative only if we read from a hole */
+ if (ret < 0) {
+ ret = 0;
+ free_extent_map(em);
+ goto unlock_err;
+ }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
- }
-
- if (unlock_extents)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- else
- free_extent_state(cached_state);
-
- /*
- * Translate extent map information to iomap.
- * We trim the extents (and move the addr) even though iomap code does
- * that, since we have locked only the parts we are performing I/O in.
- */
- if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- } else {
- iomap->addr = em->block_start + (start - em->start);
- iomap->type = IOMAP_MAPPED;
+ lockstart = start + bh_result->b_size;
+ if (lockstart < lockend) {
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ } else {
+ free_extent_state(cached_state);
+ }
}
- iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
- iomap->length = len;
free_extent_map(em);
@@ -7399,53 +7406,8 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data) {
- btrfs_delalloc_release_space(inode, dio_data->data_reserved,
- start, dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- }
- return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
-{
- int ret = 0;
- struct btrfs_dio_data *dio_data = iomap->private;
- size_t submitted = dio_data->submitted;
- const bool write = !!(flags & IOMAP_WRITE);
-
- if (!write && (iomap->type == IOMAP_HOLE)) {
- /* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
- goto out;
- }
-
- if (submitted < length) {
- pos += submitted;
- length -= submitted;
- if (write)
- __endio_write_update_ordered(inode, pos, length, false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1);
- ret = -ENOTBLK;
- }
-
- if (write) {
- if (dio_data->reserve)
- btrfs_delalloc_release_space(inode,
- dio_data->data_reserved, pos,
- dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
- extent_changeset_free(dio_data->data_reserved);
- }
-out:
- kfree(dio_data);
- iomap->private = NULL;
-
+ if (dio_data)
+ current->journal_info = dio_data;
return ret;
}
@@ -7468,7 +7430,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- bio_endio(dip->dio_bio);
+ dio_end_io(dip->dio_bio);
kfree(dip);
}
@@ -7704,11 +7666,24 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
+
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ /*
+ * Setting range start and end to the same value means that
+ * no cleanup will happen in btrfs_direct_IO
+ */
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
return dip;
}
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
- struct bio *dio_bio, loff_t file_offset)
+static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+ loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7725,7 +7700,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7734,8 +7708,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ dio_end_io(dio_bio);
+ return;
}
if (!write && csum) {
@@ -7806,27 +7780,156 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
goto out_err;
}
- dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
- return BLK_QC_T_NONE;
}
-const struct iomap_ops btrfs_dio_iomap_ops = {
- .iomap_begin = btrfs_dio_iomap_begin,
- .iomap_end = btrfs_dio_iomap_end,
-};
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int seg;
+ int i;
+ unsigned int blocksize_mask = fs_info->sectorsize - 1;
+ ssize_t retval = -EINVAL;
-const struct iomap_dio_ops btrfs_dops = {
- .submit_io = btrfs_submit_direct,
-};
+ if (offset & blocksize_mask)
+ goto out;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
+
+ /* If this is a write we don't need to check anymore */
+ if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ goto out;
+ }
+ }
+ retval = 0;
+out:
+ return retval;
+}
+
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_dio_data dio_data = { 0 };
+ struct extent_changeset *data_reserved = NULL;
+ loff_t offset = iocb->ki_pos;
+ size_t count = 0;
+ int flags = 0;
+ bool wakeup = true;
+ bool relock = false;
+ ssize_t ret;
+
+ if (check_direct_IO(fs_info, iter, offset))
+ return 0;
+
+ inode_dio_begin(inode);
+
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
+ */
+ count = iov_iter_count(iter);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset,
+ offset + count - 1);
+
+ if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * If the write DIO is beyond the EOF, we need update
+ * the isize, but it is protected by i_mutex. So we can
+ * not unlock the i_mutex at this case.
+ */
+ if (offset + count <= inode->i_size) {
+ dio_data.overwrite = 1;
+ inode_unlock(inode);
+ relock = true;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ offset, count);
+ if (ret)
+ goto out;
+
+ /*
+ * We need to know how many extents we reserved so that we can
+ * do the accounting properly if we go over the number we
+ * originally calculated. Abuse current->journal_info for this.
+ */
+ dio_data.reserve = round_up(count,
+ fs_info->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
+ current->journal_info = &dio_data;
+ down_read(&BTRFS_I(inode)->dio_sem);
+ } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags)) {
+ inode_dio_end(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
+ }
+
+ ret = __blockdev_direct_IO(iocb, inode,
+ fs_info->fs_devices->latest_bdev,
+ iter, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (iov_iter_rw(iter) == WRITE) {
+ up_read(&BTRFS_I(inode)->dio_sem);
+ current->journal_info = NULL;
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ if (dio_data.reserve)
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, dio_data.reserve, true);
+ /*
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * cleanup them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
+ */
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ __endio_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ false);
+ } else if (ret >= 0 && (size_t)ret < count)
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, count - (size_t)ret, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+ }
+out:
+ if (wakeup)
+ inode_dio_end(inode);
+ if (relock)
+ inode_lock(inode);
+
+ extent_changeset_free(data_reserved);
+ return ret;
+}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
@@ -10122,7 +10225,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = noop_direct_IO,
+ .direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index b7420e605b28..0f2adecb94f2 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -53,13 +53,6 @@ const struct fscache_cookie_def cifs_fscache_server_index_def = {
.type = FSCACHE_COOKIE_TYPE_INDEX,
};
-/*
- * Auxiliary data attached to CIFS superblock within the cache
- */
-struct cifs_fscache_super_auxdata {
- u64 resource_id; /* unique server resource id */
-};
-
char *extract_sharename(const char *treename)
{
const char *src;
@@ -98,6 +91,8 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
memset(&auxdata, 0, sizeof(auxdata));
auxdata.resource_id = tcon->resource_id;
+ auxdata.vol_create_time = tcon->vol_create_time;
+ auxdata.vol_serial_number = tcon->vol_serial_number;
if (memcmp(data, &auxdata, datalen) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 3ad1a98fd567..fc98b97b396a 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -221,8 +221,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
int i, j;
- const char *security_types[] = {"Unspecified", "LANMAN", "NTLM",
- "NTLMv2", "RawNTLMSSP", "Kerberos"};
seq_puts(m,
"Display Internal CIFS Data Structures for Debugging\n"
@@ -379,7 +377,7 @@ skip_rdma:
}
seq_printf(m,"Security type: %s\n",
- security_types[server->ops->select_sectype(server, ses->sectype)]);
+ get_security_type_str(server->ops->select_sectype(server, ses->sectype)));
if (server->rdma)
seq_printf(m, "RDMA\n\t");
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index ae421634aa42..6025d7fc7bbf 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -849,6 +849,28 @@ unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode)
return ace_size;
}
+unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace)
+{
+ int i;
+ unsigned int ace_size = 28;
+
+ pntace->type = ACCESS_ALLOWED_ACE_TYPE;
+ pntace->flags = 0x0;
+ pntace->access_req = cpu_to_le32(GENERIC_ALL);
+ pntace->sid.num_subauth = 3;
+ pntace->sid.revision = 1;
+ for (i = 0; i < NUM_AUTHS; i++)
+ pntace->sid.authority[i] = sid_unix_NFS_users.authority[i];
+
+ pntace->sid.sub_auth[0] = sid_unix_NFS_users.sub_auth[0];
+ pntace->sid.sub_auth[1] = sid_unix_NFS_users.sub_auth[1];
+ pntace->sid.sub_auth[2] = cpu_to_le32(current_fsgid().val);
+
+ /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */
+ pntace->size = cpu_to_le16(ace_size);
+ return ace_size;
+}
+
static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
struct cifs_sid *pgrpsid, __u64 nmode, bool modefromsid)
{
@@ -978,7 +1000,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid,
- bool mode_from_sid, int *aclflag)
+ bool mode_from_sid, bool id_from_sid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
@@ -1019,12 +1041,23 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
if (!nowner_sid_ptr)
return -ENOMEM;
id = from_kuid(&init_user_ns, uid);
- rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
- if (rc) {
- cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
- __func__, rc, id);
- kfree(nowner_sid_ptr);
- return rc;
+ if (id_from_sid) {
+ struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr;
+ /* Populate the user ownership fields S-1-5-88-1 */
+ osid->Revision = 1;
+ osid->NumAuth = 3;
+ osid->Authority[5] = 5;
+ osid->SubAuthorities[0] = cpu_to_le32(88);
+ osid->SubAuthorities[1] = cpu_to_le32(1);
+ osid->SubAuthorities[2] = cpu_to_le32(id);
+ } else { /* lookup sid with upcall */
+ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
+ __func__, rc, id);
+ kfree(nowner_sid_ptr);
+ return rc;
+ }
}
cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
kfree(nowner_sid_ptr);
@@ -1039,12 +1072,23 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
if (!ngroup_sid_ptr)
return -ENOMEM;
id = from_kgid(&init_user_ns, gid);
- rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
- if (rc) {
- cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
- __func__, rc, id);
- kfree(ngroup_sid_ptr);
- return rc;
+ if (id_from_sid) {
+ struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr;
+ /* Populate the group ownership fields S-1-5-88-2 */
+ gsid->Revision = 1;
+ gsid->NumAuth = 3;
+ gsid->Authority[5] = 5;
+ gsid->SubAuthorities[0] = cpu_to_le32(88);
+ gsid->SubAuthorities[1] = cpu_to_le32(2);
+ gsid->SubAuthorities[2] = cpu_to_le32(id);
+ } else { /* lookup sid with upcall */
+ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
+ __func__, rc, id);
+ kfree(ngroup_sid_ptr);
+ return rc;
+ }
}
cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
kfree(ngroup_sid_ptr);
@@ -1247,7 +1291,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
struct smb_version_operations *ops;
- bool mode_from_sid;
+ bool mode_from_sid, id_from_sid;
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1290,8 +1334,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
else
mode_from_sid = false;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
+ id_from_sid = true;
+ else
+ id_from_sid = false;
+
rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
- mode_from_sid, &aclflag);
+ mode_from_sid, id_from_sid, &aclflag);
cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 21d7dee98d01..17562ea00e18 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -176,6 +176,21 @@ struct smb3_acl {
__le16 Sbz2; /* MBZ */
} __packed;
+/*
+ * Used to store the special 'NFS SIDs' used to persist the POSIX uid and gid
+ * See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
+ */
+struct owner_sid {
+ u8 Revision;
+ u8 NumAuth;
+ u8 Authority[6];
+ __le32 SubAuthorities[3];
+} __packed;
+
+struct owner_group_sids {
+ struct owner_sid owner;
+ struct owner_sid group;
+} __packed;
/*
* Minimum security identifier can be one for system defined Users
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 889f9c71049b..0fb99d25e8a8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -623,7 +623,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
if (tcon->ses->chan_max > 1)
- seq_printf(s, ",multichannel,max_channel=%zu",
+ seq_printf(s, ",multichannel,max_channels=%zu",
tcon->ses->chan_max);
return 0;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e133bb3e172f..fad37d61910a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -2008,6 +2008,24 @@ extern struct smb_version_values smb302_values;
extern struct smb_version_operations smb311_operations;
extern struct smb_version_values smb311_values;
+static inline char *get_security_type_str(enum securityEnum sectype)
+{
+ switch (sectype) {
+ case RawNTLMSSP:
+ return "RawNTLMSSP";
+ case Kerberos:
+ return "Kerberos";
+ case NTLMv2:
+ return "NTLMv2";
+ case NTLM:
+ return "NTLM";
+ case LANMAN:
+ return "LANMAN";
+ default:
+ return "Unknown";
+ }
+}
+
static inline bool is_smb1_server(struct TCP_Server_Info *server)
{
return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index bd92070ca30c..7a836ec0438e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -198,6 +198,8 @@ extern struct inode *cifs_iget(struct super_block *sb,
extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
FILE_ALL_INFO *data, struct super_block *sb,
int xid, const struct cifs_fid *fid);
+extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
+ struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, unsigned int xid);
@@ -220,6 +222,7 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode);
+extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace);
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 36e7b2fd2190..398c1eef7190 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -411,6 +411,7 @@ cifs_create_get_file_info:
rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
xid);
else {
+ /* TODO: Add support for calling POSIX query info here, but passing in fid */
rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
xid, fid);
if (newinode) {
@@ -700,7 +701,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
full_path, d_inode(direntry));
- if (pTcon->unix_ext) {
+ if (pTcon->posix_extensions)
+ rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid);
+ else if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newInode, full_path,
parent_dir_inode->i_sb, xid);
} else {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8277859d12a3..4fe757cfc360 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -243,6 +243,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (rc)
goto out;
+ /* TODO: Add support for calling posix query info but with passing in fid */
if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
xid);
@@ -800,7 +801,9 @@ reopen_success:
if (!is_interrupt_error(rc))
mapping_set_error(inode->i_mapping, rc);
- if (tcon->unix_ext)
+ if (tcon->posix_extensions)
+ rc = smb311_posix_get_inode_info(&inode, full_path, inode->i_sb, xid);
+ else if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path,
inode->i_sb, xid);
else
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index ea6ace9c2417..da688185403c 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -96,6 +96,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
{
struct TCP_Server_Info *server = tcon->ses->server;
char *sharename;
+ struct cifs_fscache_super_auxdata auxdata;
sharename = extract_sharename(tcon->treeName);
if (IS_ERR(sharename)) {
@@ -104,11 +105,16 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
return;
}
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.resource_id = tcon->resource_id;
+ auxdata.vol_create_time = tcon->vol_create_time;
+ auxdata.vol_serial_number = tcon->vol_serial_number;
+
tcon->fscache =
fscache_acquire_cookie(server->fscache,
&cifs_fscache_super_index_def,
sharename, strlen(sharename),
- &tcon->resource_id, sizeof(tcon->resource_id),
+ &auxdata, sizeof(auxdata),
tcon, 0, true);
kfree(sharename);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
@@ -117,8 +123,15 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
+ struct cifs_fscache_super_auxdata auxdata;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.resource_id = tcon->resource_id;
+ auxdata.vol_create_time = tcon->vol_create_time;
+ auxdata.vol_serial_number = tcon->vol_serial_number;
+
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
- fscache_relinquish_cookie(tcon->fscache, &tcon->resource_id, false);
+ fscache_relinquish_cookie(tcon->fscache, &auxdata, false);
tcon->fscache = NULL;
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 8c0862e41306..1091633d2adb 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -28,6 +28,15 @@
#ifdef CONFIG_CIFS_FSCACHE
/*
+ * Auxiliary data attached to CIFS superblock within the cache
+ */
+struct cifs_fscache_super_auxdata {
+ u64 resource_id; /* unique server resource id */
+ __le64 vol_create_time;
+ u32 vol_serial_number;
+} __packed;
+
+/*
* Auxiliary data attached to CIFS inode within the cache
*/
struct cifs_fscache_inode_auxdata {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5072bcaf4be1..583f5e4008c2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,6 +32,7 @@
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
+#include "smb2proto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
@@ -595,6 +596,62 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
#endif
}
+/* Fill a cifs_fattr struct with info from POSIX info struct */
+static void
+smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info,
+ struct super_block *sb, bool adjust_tz, bool symlink)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ memset(fattr, 0, sizeof(*fattr));
+
+ /* no fattr->flags to set */
+ fattr->cf_cifsattrs = le32_to_cpu(info->DosAttributes);
+ fattr->cf_uniqueid = le64_to_cpu(info->Inode);
+
+ if (info->LastAccessTime)
+ fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+ else
+ ktime_get_coarse_real_ts64(&fattr->cf_atime);
+
+ fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
+ fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+
+ if (adjust_tz) {
+ fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+ fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
+ }
+
+ fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+ fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+ fattr->cf_createtime = le64_to_cpu(info->CreationTime);
+
+ fattr->cf_nlink = le32_to_cpu(info->HardLinks);
+ fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode);
+ /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */
+ /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */
+
+ if (symlink) {
+ fattr->cf_mode |= S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
+ } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+ fattr->cf_mode |= S_IFDIR;
+ fattr->cf_dtype = DT_DIR;
+ } else { /* file */
+ fattr->cf_mode |= S_IFREG;
+ fattr->cf_dtype = DT_REG;
+ }
+ /* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
+
+ fattr->cf_uid = cifs_sb->mnt_uid; /* TODO: map uid and gid from SID */
+ fattr->cf_gid = cifs_sb->mnt_gid;
+
+ cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
+ fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
+}
+
+
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
@@ -1023,6 +1080,121 @@ out:
return rc;
}
+int
+smb311_posix_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct super_block *sb, unsigned int xid)
+{
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct tcon_link *tlink;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ bool adjust_tz = false;
+ struct cifs_fattr fattr = {0};
+ bool symlink = false;
+ struct smb311_posix_qinfo *data = NULL;
+ int rc = 0;
+ int tmprc = 0;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
+ /*
+ * 1. Fetch file metadata
+ */
+
+ if (is_inode_cache_good(*inode)) {
+ cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
+ goto out;
+ }
+ data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL);
+ if (!data) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
+ full_path, data,
+ &adjust_tz, &symlink);
+
+ /*
+ * 2. Convert it to internal cifs metadata (fattr)
+ */
+
+ switch (rc) {
+ case 0:
+ smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ break;
+ case -EREMOTE:
+ /* DFS link, no metadata available on this server */
+ cifs_create_dfs_fattr(&fattr, sb);
+ rc = 0;
+ break;
+ case -EACCES:
+ /*
+ * For SMB2 and later the backup intent flag
+ * is already sent if needed on open and there
+ * is no path based FindFirst operation to use
+ * to retry with so nothing we can do, bail out
+ */
+ goto out;
+ default:
+ cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc);
+ goto out;
+ }
+
+
+ /*
+ * 3. Tweak fattr based on mount options
+ */
+
+ /* check for Minshall+French symlinks */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+ full_path);
+ if (tmprc)
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+ }
+
+ /*
+ * 4. Update inode with final fattr data
+ */
+
+ if (!*inode) {
+ *inode = cifs_iget(sb, &fattr);
+ if (!*inode)
+ rc = -ENOMEM;
+ } else {
+ /* we already have inode, update it */
+
+ /* if uniqueid is different, return error */
+ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+ CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
+ rc = -ESTALE;
+ goto out;
+ }
+
+ /* if filetype is different, return error */
+ if (unlikely(((*inode)->i_mode & S_IFMT) !=
+ (fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*inode)->time = 0; /* force reval */
+ rc = -ESTALE;
+ goto out;
+ }
+
+ cifs_fattr_to_inode(*inode, &fattr);
+ }
+out:
+ cifs_put_tlink(tlink);
+ kfree(data);
+ return rc;
+}
+
+
static const struct inode_operations cifs_ipc_inode_ops = {
.lookup = cifs_lookup,
};
@@ -1161,7 +1333,10 @@ struct inode *cifs_root_iget(struct super_block *sb)
}
convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
- rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL);
+ if (tcon->posix_extensions)
+ rc = smb311_posix_get_inode_info(&inode, path, sb, xid);
+ else
+ rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL);
iget_no_retry:
if (!inode) {
@@ -1517,7 +1692,9 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
int rc = 0;
struct inode *inode = NULL;
- if (tcon->unix_ext)
+ if (tcon->posix_extensions)
+ rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid);
+ else if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
xid);
else
@@ -2114,7 +2291,9 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
dentry, cifs_get_time(dentry), jiffies);
again:
- if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
+ if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions)
+ rc = smb311_posix_get_inode_info(&inode, full_path, sb, xid);
+ else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
else
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index c381d2d03ef6..94dab4309fbb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -701,7 +701,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
cifs_sb_target->local_nls); */
if (rc == 0) {
- if (pTcon->unix_ext)
+ if (pTcon->posix_extensions)
+ rc = smb311_posix_get_inode_info(&newinode, full_path, inode->i_sb, xid);
+ else if (pTcon->unix_ext)
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
else
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index dd10f0ce4cd5..cf20f0b5d836 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -45,6 +45,7 @@
#define SMB2_OP_HARDLINK 8
#define SMB2_OP_SET_EOF 9
#define SMB2_OP_RMDIR 10
+#define SMB2_OP_POSIX_QUERY_INFO 11
/* Used when constructing chained read requests. */
#define CHAINED_REQUEST 1
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 0a116fc490a9..b9db73687eaa 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -166,6 +166,40 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
full_path);
break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ rqst[num_rqst].rq_iov = &vars->qi_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile)
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ /* TBD: fix following to allow for longer SIDs */
+ sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
+ }
+
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
case SMB2_OP_DELETE:
trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
break;
@@ -379,6 +413,24 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_query_info_compound_done(xid, ses->Suid,
tcon->tid);
break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr);
+ }
+ if (rqst[1].rq_iov)
+ SMB2_query_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_posix_query_info_compound_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_posix_query_info_compound_done(xid, ses->Suid, tcon->tid);
+ break;
case SMB2_OP_DELETE:
if (rc)
trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
@@ -512,6 +564,59 @@ out:
return rc;
}
+
+int
+smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct smb311_posix_qinfo *data, bool *adjust_tz, bool *symlink)
+{
+ int rc;
+ __u32 create_options = 0;
+ struct cifsFileInfo *cfile;
+ struct smb311_posix_qinfo *smb2_data;
+
+ *adjust_tz = false;
+ *symlink = false;
+
+ /* BB TODO: Make struct larger when add support for parsing owner SIDs */
+ smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo),
+ GFP_KERNEL);
+ if (smb2_data == NULL)
+ return -ENOMEM;
+
+ /*
+ * BB TODO: Add support for using the cached root handle.
+ * Create SMB2_query_posix_info worker function to do non-compounded query
+ * when we already have an open file handle for this. For now this is fast enough
+ * (always using the compounded version).
+ */
+
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
+ ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
+ if (rc == -EOPNOTSUPP) {
+ /* BB TODO: When support for special files added to Samba re-verify this path */
+ *symlink = true;
+ create_options |= OPEN_REPARSE_POINT;
+
+ /* Failed on a symbolic link - query a reparse point info */
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE,
+ smb2_data, SMB2_OP_POSIX_QUERY_INFO, NULL);
+ }
+ if (rc)
+ goto out;
+
+ /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */
+ memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo));
+
+out:
+ kfree(smb2_data);
+ return rc;
+}
+
int
smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
struct cifs_tcon *tcon, const char *name,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ded96b529a4d..2f4cdd290c46 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2317,28 +2317,75 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
return 0;
}
+/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
+static void setup_owner_group_sids(char *buf)
+{
+ struct owner_group_sids *sids = (struct owner_group_sids *)buf;
+
+ /* Populate the user ownership fields S-1-5-88-1 */
+ sids->owner.Revision = 1;
+ sids->owner.NumAuth = 3;
+ sids->owner.Authority[5] = 5;
+ sids->owner.SubAuthorities[0] = cpu_to_le32(88);
+ sids->owner.SubAuthorities[1] = cpu_to_le32(1);
+ sids->owner.SubAuthorities[2] = cpu_to_le32(current_fsuid().val);
+
+ /* Populate the group ownership fields S-1-5-88-2 */
+ sids->group.Revision = 1;
+ sids->group.NumAuth = 3;
+ sids->group.Authority[5] = 5;
+ sids->group.SubAuthorities[0] = cpu_to_le32(88);
+ sids->group.SubAuthorities[1] = cpu_to_le32(2);
+ sids->group.SubAuthorities[2] = cpu_to_le32(current_fsgid().val);
+
+ cifs_dbg(FYI, "owner S-1-5-88-1-%d, group S-1-5-88-2-%d\n", current_fsuid().val, current_fsgid().val);
+}
+
/* See MS-SMB2 2.2.13.2.2 and MS-DTYP 2.4.6 */
static struct crt_sd_ctxt *
-create_sd_buf(umode_t mode, unsigned int *len)
+create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
{
struct crt_sd_ctxt *buf;
struct cifs_ace *pace;
unsigned int sdlen, acelen;
+ unsigned int owner_offset = 0;
+ unsigned int group_offset = 0;
+
+ *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 2), 8);
+
+ if (set_owner) {
+ /* offset fields are from beginning of security descriptor not of create context */
+ owner_offset = sizeof(struct smb3_acl) + (sizeof(struct cifs_ace) * 2);
+
+ /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
+ *len += sizeof(struct owner_group_sids);
+ }
- *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace) * 2,
- 8);
buf = kzalloc(*len, GFP_KERNEL);
if (buf == NULL)
return buf;
+ if (set_owner) {
+ buf->sd.OffsetOwner = cpu_to_le32(owner_offset);
+ group_offset = owner_offset + sizeof(struct owner_sid);
+ buf->sd.OffsetGroup = cpu_to_le32(group_offset);
+ } else {
+ buf->sd.OffsetOwner = 0;
+ buf->sd.OffsetGroup = 0;
+ }
+
sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) +
2 * sizeof(struct cifs_ace);
+ if (set_owner) {
+ sdlen += sizeof(struct owner_group_sids);
+ setup_owner_group_sids(owner_offset + sizeof(struct create_context) + 8 /* name */
+ + (char *)buf);
+ }
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct crt_sd_ctxt, sd));
buf->ccontext.DataLength = cpu_to_le32(sdlen);
- buf->ccontext.NameOffset = cpu_to_le16(offsetof
- (struct crt_sd_ctxt, Name));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof(struct crt_sd_ctxt, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
/* SMB2_CREATE_SD_BUFFER_TOKEN is "SecD" */
buf->Name[0] = 'S';
@@ -2359,23 +2406,34 @@ create_sd_buf(umode_t mode, unsigned int *len)
/* create one ACE to hold the mode embedded in reserved special SID */
pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf);
acelen = setup_special_mode_ACE(pace, (__u64)mode);
+
+ if (set_owner) {
+ /* we do not need to reallocate buffer to add the two more ACEs. plenty of space */
+ pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) + (char *)buf));
+ acelen += setup_special_user_owner_ACE(pace);
+ /* it does not appear necessary to add an ACE for the NFS group SID */
+ buf->acl.AceCount = cpu_to_le16(3);
+ } else
+ buf->acl.AceCount = cpu_to_le16(2);
+
/* and one more ACE to allow access for authenticated users */
pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) +
(char *)buf));
acelen += setup_authusers_ACE(pace);
+
buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen);
- buf->acl.AceCount = cpu_to_le16(2);
+
return buf;
}
static int
-add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
+add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set_owner)
{
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
unsigned int len = 0;
- iov[num].iov_base = create_sd_buf(mode, &len);
+ iov[num].iov_base = create_sd_buf(mode, set_owner, &len);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = len;
@@ -2764,21 +2822,35 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
return rc;
}
- if ((oparms->disposition != FILE_OPEN) &&
- (oparms->cifs_sb) &&
- (oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) &&
- (oparms->mode != ACL_NO_MODE)) {
- if (n_iov > 2) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next =
- cpu_to_le32(iov[n_iov-1].iov_len);
+ if ((oparms->disposition != FILE_OPEN) && (oparms->cifs_sb)) {
+ bool set_mode;
+ bool set_owner;
+
+ if ((oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) &&
+ (oparms->mode != ACL_NO_MODE))
+ set_mode = true;
+ else {
+ set_mode = false;
+ oparms->mode = ACL_NO_MODE;
}
- cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode);
- rc = add_sd_context(iov, &n_iov, oparms->mode);
- if (rc)
- return rc;
+ if (oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
+ set_owner = true;
+ else
+ set_owner = false;
+
+ if (set_owner | set_mode) {
+ if (n_iov > 2) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[n_iov-1].iov_base;
+ ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
+ }
+
+ cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode);
+ rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner);
+ if (rc)
+ return rc;
+ }
}
if (n_iov > 2) {
@@ -2973,7 +3045,9 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
- req->sync_hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max_response_size, SMB2_MAX_BUFFER_SIZE));
+ req->sync_hdr.CreditCharge =
+ cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
+ SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
else
@@ -3457,6 +3531,19 @@ int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
}
int
+SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen)
+{
+ size_t output_len = sizeof(struct smb311_posix_qinfo *) +
+ (sizeof(struct cifs_sid) * 2) + (PATH_MAX * 2);
+ *plen = 0;
+
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ SMB_FIND_FILE_POSIX_INFO, SMB2_O_INFO_FILE, 0,
+ output_len, sizeof(struct smb311_posix_qinfo), (void **)&data, plen);
+}
+
+int
SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
void **data, u32 *plen)
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 3b0e6acf9d7d..236814d61248 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1653,7 +1653,7 @@ struct create_posix_rsp {
} __packed;
/*
- * SMB2-only POSIX info level
+ * SMB2-only POSIX info level for query dir
*
* See posix_info_sid_size(), posix_info_extra_size() and
* posix_info_parse() to help with the handling of this struct.
@@ -1683,6 +1683,31 @@ struct smb2_posix_info {
*/
} __packed;
+/* Level 100 query info */
+struct smb311_posix_qinfo {
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ u8 Sids[];
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
/*
* Parsed version of the above struct. Allows direct access to the
* variable length fields
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 71ba74792c9e..2f8ecbf54214 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -182,6 +182,8 @@ extern int SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
struct TCP_Server_Info *server,
u64 persistent_file_id, u64 volatile_file_id);
extern void SMB2_flush_free(struct smb_rqst *rqst);
+extern int SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen);
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct smb2_file_all_info *data);
@@ -289,6 +291,10 @@ extern int smb2_query_info_compound(const unsigned int xid,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
+/* query path info from the server using SMB311 POSIX extensions*/
+extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf,
+ bool *adjust_tx, bool *symlink);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 4cb0d5f7ce45..eef4b08c7208 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -318,6 +318,7 @@ DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \
TP_ARGS(xid, tid, sesid, full_path))
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
@@ -354,6 +355,7 @@ DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \
TP_ARGS(xid, tid, sesid))
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
@@ -395,6 +397,7 @@ DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \
TP_ARGS(xid, tid, sesid, rc))
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
diff --git a/fs/coredump.c b/fs/coredump.c
index 478a0d810136..7237f07ff6be 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -393,7 +393,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
* of ->siglock provides a memory barrier.
*
* do_exit:
- * The caller holds mm->mmap_sem. This means that the task which
+ * The caller holds mm->mmap_lock. This means that the task which
* uses this mm can't pass exit_mm(), so it can't exit or clear
* its ->mm.
*
@@ -401,7 +401,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
* It does list_replace_rcu(&leader->tasks, &current->tasks),
* we must see either old or new leader, this does not matter.
* However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_sem
+ * must be used. Since p->mm != NULL and we hold ->mmap_lock
* it can't fail.
*
* Note also that "g" can be the old leader with ->mm == NULL
@@ -445,12 +445,12 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
if (!mm->core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (core_waiters > 0) {
struct core_thread *ptr;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1543b5af400e..6d5370eac2a8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,6 +386,25 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio)
+{
+ struct dio *dio = bio->bi_private;
+
+ if (dio->is_async)
+ dio_bio_end_aio(bio);
+ else
+ dio_bio_end_io(bio);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 78e41c7c3d05..df466ef81ddd 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -23,6 +23,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
+#include <linux/uio.h>
DEFINE_PER_CPU(int, eventfd_wake_count);
@@ -216,32 +217,32 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
+static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct eventfd_ctx *ctx = file->private_data;
- ssize_t res;
__u64 ucnt = 0;
DECLARE_WAITQUEUE(wait, current);
- if (count < sizeof(ucnt))
+ if (iov_iter_count(to) < sizeof(ucnt))
return -EINVAL;
-
spin_lock_irq(&ctx->wqh.lock);
- res = -EAGAIN;
- if (ctx->count > 0)
- res = sizeof(ucnt);
- else if (!(file->f_flags & O_NONBLOCK)) {
+ if (!ctx->count) {
+ if ((file->f_flags & O_NONBLOCK) ||
+ (iocb->ki_flags & IOCB_NOWAIT)) {
+ spin_unlock_irq(&ctx->wqh.lock);
+ return -EAGAIN;
+ }
__add_wait_queue(&ctx->wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (ctx->count > 0) {
- res = sizeof(ucnt);
+ if (ctx->count)
break;
- }
if (signal_pending(current)) {
- res = -ERESTARTSYS;
- break;
+ __remove_wait_queue(&ctx->wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock_irq(&ctx->wqh.lock);
+ return -ERESTARTSYS;
}
spin_unlock_irq(&ctx->wqh.lock);
schedule();
@@ -250,17 +251,14 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
__remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING);
}
- if (likely(res > 0)) {
- eventfd_ctx_do_read(ctx, &ucnt);
- if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
- }
+ eventfd_ctx_do_read(ctx, &ucnt);
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
spin_unlock_irq(&ctx->wqh.lock);
-
- if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
+ if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
return -EFAULT;
- return res;
+ return sizeof(ucnt);
}
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
@@ -329,7 +327,7 @@ static const struct file_operations eventfd_fops = {
#endif
.release = eventfd_release,
.poll = eventfd_poll,
- .read = eventfd_read,
+ .read_iter = eventfd_read,
.write = eventfd_write,
.llseek = noop_llseek,
};
@@ -406,6 +404,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
static int do_eventfd(unsigned int count, int flags)
{
struct eventfd_ctx *ctx;
+ struct file *file;
int fd;
/* Check the EFD_* constants for consistency. */
@@ -425,11 +424,24 @@ static int do_eventfd(unsigned int count, int flags)
ctx->flags = flags;
ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
- fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
- O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
+ flags &= EFD_SHARED_FCNTL_FLAGS;
+ flags |= O_RDWR;
+ fd = get_unused_fd_flags(flags);
if (fd < 0)
- eventfd_free_ctx(ctx);
+ goto err;
+
+ file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(file);
+ goto err;
+ }
+ file->f_mode |= FMODE_NOWAIT;
+ fd_install(fd, file);
+ return fd;
+err:
+ eventfd_free_ctx(ctx);
return fd;
}
diff --git a/fs/exec.c b/fs/exec.c
index 02d0c5d19be5..e6e8a9a70327 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -252,7 +252,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
return -ENOMEM;
vma_set_anonymous(vma);
- if (down_write_killable(&mm->mmap_sem)) {
+ if (mmap_write_lock_killable(mm)) {
err = -EINTR;
goto err_free;
}
@@ -274,11 +274,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
goto err;
mm->stack_vm = mm->total_vm = 1;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
bprm->p = vma->vm_end - sizeof(void *);
return 0;
err:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
err_free:
bprm->vma = NULL;
vm_area_free(vma);
@@ -763,7 +763,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
vm_flags = VM_STACK_FLAGS;
@@ -825,7 +825,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
ret = -EFAULT;
out_unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
@@ -1091,12 +1091,12 @@ static int exec_mmap(struct mm_struct *mm)
/*
* Make sure that if there is a core dump in progress
* for the old mm, we get out and die instead of going
- * through with the exec. We must hold mmap_sem around
+ * through with the exec. We must hold mmap_lock around
* checking core_state and changing tsk->mm.
*/
- down_read(&old_mm->mmap_sem);
+ mmap_read_lock(old_mm);
if (unlikely(old_mm->core_state)) {
- up_read(&old_mm->mmap_sem);
+ mmap_read_unlock(old_mm);
mutex_unlock(&tsk->signal->exec_update_mutex);
return -EINTR;
}
@@ -1112,7 +1112,7 @@ static int exec_mmap(struct mm_struct *mm)
vmacache_flush(tsk);
task_unlock(tsk);
if (old_mm) {
- up_read(&old_mm->mmap_sem);
+ mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
index 2d3636dc5b8c..5a65071b5ecf 100644
--- a/fs/exfat/Kconfig
+++ b/fs/exfat/Kconfig
@@ -16,6 +16,7 @@ config EXFAT_DEFAULT_IOCHARSET
depends on EXFAT_FS
help
Set this to the default input/output character set to use for
- converting between the encoding is used for user visible filename and
- UTF-16 character that exfat filesystem use, and can be overridden with
- the "iocharset" mount option for exFAT filesystems.
+ converting between the encoding that is used for user visible
+ filenames and the UTF-16 character encoding that the exFAT
+ filesystem uses. This can be overridden with the "iocharset" mount
+ option for the exFAT filesystems.
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 6774a5a6ded8..4055eb00ea9b 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -58,9 +58,8 @@ static int exfat_allocate_bitmap(struct super_block *sb,
need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE)
+ 1;
if (need_map_size != map_size) {
- exfat_msg(sb, KERN_ERR,
- "bogus allocation bitmap size(need : %u, cur : %lld)",
- need_map_size, map_size);
+ exfat_err(sb, "bogus allocation bitmap size(need : %u, cur : %lld)",
+ need_map_size, map_size);
/*
* Only allowed when bogus allocation
* bitmap size is large
@@ -192,8 +191,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
(1 << sbi->sect_per_clus_bits), GFP_NOFS, 0);
if (ret_discard == -EOPNOTSUPP) {
- exfat_msg(sb, KERN_ERR,
- "discard not supported by device, disabling");
+ exfat_err(sb, "discard not supported by device, disabling");
opts->discard = 0;
}
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 4b91afb0f051..de43534aa299 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -32,35 +32,30 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned short *uniname)
{
int i;
- struct exfat_dentry *ep;
struct exfat_entry_set_cache *es;
- es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES, &ep);
+ es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES);
if (!es)
return;
- if (es->num_entries < 3)
- goto free_es;
-
- ep += 2;
-
/*
* First entry : file entry
* Second entry : stream-extension entry
* Third entry : first file-name entry
* So, the index of first file-name dentry should start from 2.
*/
- for (i = 2; i < es->num_entries; i++, ep++) {
+ for (i = 2; i < es->num_entries; i++) {
+ struct exfat_dentry *ep = exfat_get_dentry_cached(es, i);
+
/* end of name entry */
if (exfat_get_entry_type(ep) != TYPE_EXTEND)
- goto free_es;
+ break;
exfat_extract_uni_name(ep, uniname);
uniname += EXFAT_FILE_NAME_LEN;
}
-free_es:
- kfree(es);
+ exfat_free_dentry_set(es, false);
}
/* read a directory entry from the opened directory */
@@ -137,12 +132,12 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
ep->dentry.file.create_tz,
ep->dentry.file.create_time,
ep->dentry.file.create_date,
- ep->dentry.file.create_time_ms);
+ ep->dentry.file.create_time_cs);
exfat_get_entry_time(sbi, &dir_entry->mtime,
ep->dentry.file.modify_tz,
ep->dentry.file.modify_time,
ep->dentry.file.modify_date,
- ep->dentry.file.modify_time_ms);
+ ep->dentry.file.modify_time_cs);
exfat_get_entry_time(sbi, &dir_entry->atime,
ep->dentry.file.access_tz,
ep->dentry.file.access_time,
@@ -461,12 +456,12 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
&ep->dentry.file.create_tz,
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
- &ep->dentry.file.create_time_ms);
+ &ep->dentry.file.create_time_cs);
exfat_set_entry_time(sbi, &ts,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_ms);
+ &ep->dentry.file.modify_time_cs);
exfat_set_entry_time(sbi, &ts,
&ep->dentry.file.access_tz,
&ep->dentry.file.access_time,
@@ -496,7 +491,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
int ret = 0;
int i, num_entries;
sector_t sector;
- unsigned short chksum;
+ u16 chksum;
struct exfat_dentry *ep, *fep;
struct buffer_head *fbh, *bh;
@@ -505,7 +500,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
num_entries = fep->dentry.file.num_ext + 1;
- chksum = exfat_calc_chksum_2byte(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
+ chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
for (i = 1; i < num_entries; i++) {
ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL);
@@ -513,7 +508,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
ret = -EIO;
goto release_fbh;
}
- chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum,
+ chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
CS_DEFAULT);
brelse(bh);
}
@@ -590,62 +585,33 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
return 0;
}
-int exfat_update_dir_chksum_with_entry_set(struct super_block *sb,
- struct exfat_entry_set_cache *es, int sync)
+void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
{
- struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct buffer_head *bh;
- sector_t sec = es->sector;
- unsigned int off = es->offset;
- int chksum_type = CS_DIR_ENTRY, i, num_entries = es->num_entries;
- unsigned int buf_off = (off - es->offset);
- unsigned int remaining_byte_in_sector, copy_entries, clu;
+ int chksum_type = CS_DIR_ENTRY, i;
unsigned short chksum = 0;
+ struct exfat_dentry *ep;
- for (i = 0; i < num_entries; i++) {
- chksum = exfat_calc_chksum_2byte(&es->entries[i], DENTRY_SIZE,
- chksum, chksum_type);
+ for (i = 0; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
+ chksum_type);
chksum_type = CS_DEFAULT;
}
+ ep = exfat_get_dentry_cached(es, 0);
+ ep->dentry.file.checksum = cpu_to_le16(chksum);
+ es->modified = true;
+}
- es->entries[0].dentry.file.checksum = cpu_to_le16(chksum);
+void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+{
+ int i;
- while (num_entries) {
- /* write per sector base */
- remaining_byte_in_sector = (1 << sb->s_blocksize_bits) - off;
- copy_entries = min_t(int,
- EXFAT_B_TO_DEN(remaining_byte_in_sector),
- num_entries);
- bh = sb_bread(sb, sec);
- if (!bh)
- goto err_out;
- memcpy(bh->b_data + off,
- (unsigned char *)&es->entries[0] + buf_off,
- EXFAT_DEN_TO_B(copy_entries));
- exfat_update_bh(sb, bh, sync);
- brelse(bh);
- num_entries -= copy_entries;
-
- if (num_entries) {
- /* get next sector */
- if (exfat_is_last_sector_in_cluster(sbi, sec)) {
- clu = exfat_sector_to_cluster(sbi, sec);
- if (es->alloc_flag == ALLOC_NO_FAT_CHAIN)
- clu++;
- else if (exfat_get_next_cluster(sb, &clu))
- goto err_out;
- sec = exfat_cluster_to_sector(sbi, clu);
- } else {
- sec++;
- }
- off = 0;
- buf_off += EXFAT_DEN_TO_B(copy_entries);
- }
+ for (i = 0; i < es->num_bh; i++) {
+ if (es->modified)
+ exfat_update_bh(es->sb, es->bh[i], sync);
+ brelse(es->bh[i]);
}
-
- return 0;
-err_out:
- return -EIO;
+ kfree(es);
}
static int exfat_walk_fat_chain(struct super_block *sb,
@@ -720,9 +686,8 @@ static int exfat_dir_readahead(struct super_block *sb, sector_t sec)
return 0;
if (sec < sbi->data_start_sector) {
- exfat_msg(sb, KERN_ERR,
- "requested sector is invalid(sect:%llu, root:%llu)",
- (unsigned long long)sec, sbi->data_start_sector);
+ exfat_err(sb, "requested sector is invalid(sect:%llu, root:%llu)",
+ (unsigned long long)sec, sbi->data_start_sector);
return -EIO;
}
@@ -750,7 +715,7 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
sector_t sec;
if (p_dir->dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry\n");
+ exfat_err(sb, "abnormal access to deleted dentry");
return NULL;
}
@@ -821,39 +786,45 @@ static bool exfat_validate_entry(unsigned int type,
}
}
+struct exfat_dentry *exfat_get_dentry_cached(
+ struct exfat_entry_set_cache *es, int num)
+{
+ int off = es->start_off + num * DENTRY_SIZE;
+ struct buffer_head *bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)];
+ char *p = bh->b_data + EXFAT_BLK_OFFSET(off, es->sb);
+
+ return (struct exfat_dentry *)p;
+}
+
/*
* Returns a set of dentries for a file or dir.
*
- * Note that this is a copy (dump) of dentries so that user should
- * call write_entry_set() to apply changes made in this entry set
- * to the real device.
+ * Note It provides a direct pointer to bh->data via exfat_get_dentry_cached().
+ * User should call exfat_get_dentry_set() after setting 'modified' to apply
+ * changes made in this entry set to the real device.
*
* in:
* sb+p_dir+entry: indicates a file/dir
* type: specifies how many dentries should be included.
- * out:
- * file_ep: will point the first dentry(= file dentry) on success
* return:
* pointer of entry set on success,
* NULL on failure.
*/
struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type,
- struct exfat_dentry **file_ep)
+ struct exfat_chain *p_dir, int entry, unsigned int type)
{
- int ret;
+ int ret, i, num_bh;
unsigned int off, byte_offset, clu = 0;
- unsigned int entry_type;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_entry_set_cache *es;
- struct exfat_dentry *ep, *pos;
- unsigned char num_entries;
+ struct exfat_dentry *ep;
+ int num_entries;
enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
struct buffer_head *bh;
if (p_dir->dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR, "access to deleted dentry\n");
+ exfat_err(sb, "access to deleted dentry");
return NULL;
}
@@ -862,11 +833,18 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
if (ret)
return NULL;
+ es = kzalloc(sizeof(*es), GFP_KERNEL);
+ if (!es)
+ return NULL;
+ es->sb = sb;
+ es->modified = false;
+
/* byte offset in cluster */
byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi);
/* byte offset in sector */
off = EXFAT_BLK_OFFSET(byte_offset, sb);
+ es->start_off = off;
/* sector offset in cluster */
sec = EXFAT_B_TO_BLK(byte_offset, sb);
@@ -874,72 +852,46 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
bh = sb_bread(sb, sec);
if (!bh)
- return NULL;
-
- ep = (struct exfat_dentry *)(bh->b_data + off);
- entry_type = exfat_get_entry_type(ep);
+ goto free_es;
+ es->bh[es->num_bh++] = bh;
- if (entry_type != TYPE_FILE && entry_type != TYPE_DIR)
- goto release_bh;
+ ep = exfat_get_dentry_cached(es, 0);
+ if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
+ goto free_es;
num_entries = type == ES_ALL_ENTRIES ?
ep->dentry.file.num_ext + 1 : type;
- es = kmalloc(struct_size(es, entries, num_entries), GFP_KERNEL);
- if (!es)
- goto release_bh;
-
es->num_entries = num_entries;
- es->sector = sec;
- es->offset = off;
- es->alloc_flag = p_dir->flags;
-
- pos = &es->entries[0];
-
- while (num_entries) {
- if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
-
- /* copy dentry */
- memcpy(pos, ep, sizeof(struct exfat_dentry));
-
- if (--num_entries == 0)
- break;
-
- if (((off + DENTRY_SIZE) & (sb->s_blocksize - 1)) <
- (off & (sb->s_blocksize - 1))) {
- /* get the next sector */
- if (exfat_is_last_sector_in_cluster(sbi, sec)) {
- if (es->alloc_flag == ALLOC_NO_FAT_CHAIN)
- clu++;
- else if (exfat_get_next_cluster(sb, &clu))
- goto free_es;
- sec = exfat_cluster_to_sector(sbi, clu);
- } else {
- sec++;
- }
- brelse(bh);
- bh = sb_bread(sb, sec);
- if (!bh)
+ num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
+ for (i = 1; i < num_bh; i++) {
+ /* get the next sector */
+ if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ if (p_dir->flags == ALLOC_NO_FAT_CHAIN)
+ clu++;
+ else if (exfat_get_next_cluster(sb, &clu))
goto free_es;
- off = 0;
- ep = (struct exfat_dentry *)bh->b_data;
+ sec = exfat_cluster_to_sector(sbi, clu);
} else {
- ep++;
- off += DENTRY_SIZE;
+ sec++;
}
- pos++;
+
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ goto free_es;
+ es->bh[es->num_bh++] = bh;
}
- if (file_ep)
- *file_ep = &es->entries[0];
- brelse(bh);
+ /* validiate cached dentries */
+ for (i = 1; i < num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
+ goto free_es;
+ }
return es;
free_es:
- kfree(es);
-release_bh:
- brelse(bh);
+ exfat_free_dentry_set(es, false);
return NULL;
}
@@ -1048,7 +1000,7 @@ rewind:
}
if (entry_type == TYPE_STREAM) {
- unsigned short name_hash;
+ u16 name_hash;
if (step != DIRENT_STEP_STRM) {
step = DIRENT_STEP_FILE;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index d67fb8a6f770..595f3117f492 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -71,10 +71,8 @@ enum {
#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE)
-#define FAT_CACHE_SIZE 128
-#define FAT_CACHE_HASH_SIZE 64
-#define BUF_CACHE_SIZE 256
-#define BUF_CACHE_HASH_SIZE 64
+/* Enough size to hold 256 dentry (even 512 Byte sector) */
+#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1)
#define EXFAT_HINT_NONE -1
#define EXFAT_MIN_SUBDIR 2
@@ -139,7 +137,7 @@ struct exfat_dentry_namebuf {
struct exfat_uni_name {
/* +3 for null and for converting */
unsigned short name[MAX_NAME_LENGTH + 3];
- unsigned short name_hash;
+ u16 name_hash;
unsigned char name_len;
};
@@ -170,14 +168,12 @@ struct exfat_hint {
};
struct exfat_entry_set_cache {
- /* sector number that contains file_entry */
- sector_t sector;
- /* byte offset in the sector */
- unsigned int offset;
- /* flag in stream entry. 01 for cluster chain, 03 for contig. */
- int alloc_flag;
+ struct super_block *sb;
+ bool modified;
+ unsigned int start_off;
+ int num_bh;
+ struct buffer_head *bh[DIR_CACHE_SIZE];
unsigned int num_entries;
- struct exfat_dentry entries[];
};
struct exfat_dir_entry {
@@ -231,7 +227,7 @@ struct exfat_sb_info {
unsigned int root_dir; /* root dir cluster */
unsigned int dentries_per_clu; /* num of dentries per cluster */
unsigned int vol_flag; /* volume dirty flag */
- struct buffer_head *pbr_bh; /* buffer_head of PBR sector */
+ struct buffer_head *boot_bh; /* buffer_head of BOOT sector */
unsigned int map_clu; /* allocation bitmap start cluster */
unsigned int map_sectors; /* num of allocation bitmap sectors */
@@ -451,8 +447,7 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
int entry, int order, int num_entries);
int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
int entry);
-int exfat_update_dir_chksum_with_entry_set(struct super_block *sb,
- struct exfat_entry_set_cache *es, int sync);
+void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
@@ -463,9 +458,11 @@ int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, struct buffer_head **bh,
sector_t *sector);
+struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
+ int num);
struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type,
- struct exfat_dentry **file_ep);
+ struct exfat_chain *p_dir, int entry, unsigned int type);
+void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
@@ -492,8 +489,6 @@ int exfat_nls_to_utf16(struct super_block *sb,
struct exfat_uni_name *uniname, int *p_lossy);
int exfat_create_upcase_table(struct super_block *sb);
void exfat_free_upcase_table(struct exfat_sb_info *sbi);
-unsigned short exfat_high_surrogate(unicode_t u);
-unsigned short exfat_low_surrogate(unicode_t u);
/* exfat/misc.c */
void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
@@ -505,13 +500,20 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
fmt, ## args)
void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
__printf(3, 4) __cold;
+#define exfat_err(sb, fmt, ...) \
+ exfat_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define exfat_warn(sb, fmt, ...) \
+ exfat_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define exfat_info(sb, fmt, ...) \
+ exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
- u8 tz, __le16 time, __le16 date, u8 time_ms);
+ u8 tz, __le16 time, __le16 date, u8 time_cs);
void exfat_truncate_atime(struct timespec64 *ts);
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
- u8 *tz, __le16 *time, __le16 *date, u8 *time_ms);
-unsigned short exfat_calc_chksum_2byte(void *data, int len,
- unsigned short chksum, int type);
+ u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
+u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
+u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type);
void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync);
void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
unsigned int size, unsigned char flags);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 2a841010e649..350ce59cc324 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -8,12 +8,15 @@
#include <linux/types.h>
-#define PBR_SIGNATURE 0xAA55
+#define BOOT_SIGNATURE 0xAA55
+#define EXBOOT_SIGNATURE 0xAA550000
+#define STR_EXFAT "EXFAT " /* size should be 8 */
#define EXFAT_MAX_FILE_LEN 255
#define VOL_CLEAN 0x0000
#define VOL_DIRTY 0x0002
+#define ERR_MEDIUM 0x0004
#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu
#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u
@@ -55,7 +58,7 @@
/* checksum types */
#define CS_DIR_ENTRY 0
-#define CS_PBR_SECTOR 1
+#define CS_BOOT_SECTOR 1
#define CS_DEFAULT 2
/* file attributes */
@@ -69,57 +72,35 @@
#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \
ATTR_SUBDIR | ATTR_ARCHIVE)
-#define PBR64_JUMP_BOOT_LEN 3
-#define PBR64_OEM_NAME_LEN 8
-#define PBR64_RESERVED_LEN 53
+#define BOOTSEC_JUMP_BOOT_LEN 3
+#define BOOTSEC_FS_NAME_LEN 8
+#define BOOTSEC_OLDBPB_LEN 53
#define EXFAT_FILE_NAME_LEN 15
-/* EXFAT BIOS parameter block (64 bytes) */
-struct bpb64 {
- __u8 jmp_boot[PBR64_JUMP_BOOT_LEN];
- __u8 oem_name[PBR64_OEM_NAME_LEN];
- __u8 res_zero[PBR64_RESERVED_LEN];
-} __packed;
-
-/* EXFAT EXTEND BIOS parameter block (56 bytes) */
-struct bsx64 {
- __le64 vol_offset;
- __le64 vol_length;
- __le32 fat_offset;
- __le32 fat_length;
- __le32 clu_offset;
- __le32 clu_count;
- __le32 root_cluster;
- __le32 vol_serial;
- __u8 fs_version[2];
- __le16 vol_flags;
- __u8 sect_size_bits;
- __u8 sect_per_clus_bits;
- __u8 num_fats;
- __u8 phy_drv_no;
- __u8 perc_in_use;
- __u8 reserved2[7];
-} __packed;
-
-/* EXFAT PBR[BPB+BSX] (120 bytes) */
-struct pbr64 {
- struct bpb64 bpb;
- struct bsx64 bsx;
-} __packed;
-
-/* Common PBR[Partition Boot Record] (512 bytes) */
-struct pbr {
- union {
- __u8 raw[64];
- struct bpb64 f64;
- } bpb;
- union {
- __u8 raw[56];
- struct bsx64 f64;
- } bsx;
- __u8 boot_code[390];
- __le16 signature;
+/* EXFAT: Main and Backup Boot Sector (512 bytes) */
+struct boot_sector {
+ __u8 jmp_boot[BOOTSEC_JUMP_BOOT_LEN];
+ __u8 fs_name[BOOTSEC_FS_NAME_LEN];
+ __u8 must_be_zero[BOOTSEC_OLDBPB_LEN];
+ __le64 partition_offset;
+ __le64 vol_length;
+ __le32 fat_offset;
+ __le32 fat_length;
+ __le32 clu_offset;
+ __le32 clu_count;
+ __le32 root_cluster;
+ __le32 vol_serial;
+ __u8 fs_revision[2];
+ __le16 vol_flags;
+ __u8 sect_size_bits;
+ __u8 sect_per_clus_bits;
+ __u8 num_fats;
+ __u8 drv_sel;
+ __u8 percent_in_use;
+ __u8 reserved[7];
+ __u8 boot_code[390];
+ __le16 signature;
} __packed;
struct exfat_dentry {
@@ -136,8 +117,8 @@ struct exfat_dentry {
__le16 modify_date;
__le16 access_time;
__le16 access_date;
- __u8 create_time_ms;
- __u8 modify_time_ms;
+ __u8 create_time_cs;
+ __u8 modify_time_cs;
__u8 create_tz;
__u8 modify_tz;
__u8 access_tz;
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index a855b1769a96..4e5c5c9c0f2d 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -169,9 +169,8 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
return 0;
/* check cluster validation */
- if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) {
- exfat_msg(sb, KERN_ERR, "invalid start cluster (%u)",
- p_chain->dir);
+ if (!is_valid_cluster(sbi, p_chain->dir)) {
+ exfat_err(sb, "invalid start cluster (%u)", p_chain->dir);
return -EIO;
}
@@ -305,8 +304,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
return 0;
release_bhs:
- exfat_msg(sb, KERN_ERR, "failed zeroed sect %llu\n",
- (unsigned long long)blknr);
+ exfat_err(sb, "failed zeroed sect %llu\n", (unsigned long long)blknr);
for (i = 0; i < n; i++)
bforget(bhs[i]);
return err;
@@ -337,9 +335,8 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
/* find new cluster */
if (hint_clu == EXFAT_EOF_CLUSTER) {
if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) {
- exfat_msg(sb, KERN_ERR,
- "sbi->clu_srch_ptr is invalid (%u)\n",
- sbi->clu_srch_ptr);
+ exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)\n",
+ sbi->clu_srch_ptr);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
}
@@ -349,8 +346,8 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
}
/* check cluster validation */
- if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) {
- exfat_msg(sb, KERN_ERR, "hint_cluster is invalid (%u)\n",
+ if (!is_valid_cluster(sbi, hint_clu)) {
+ exfat_err(sb, "hint_cluster is invalid (%u)",
hint_clu);
hint_clu = EXFAT_FIRST_CLUSTER;
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index c9db8eb0cfc3..fce03f318787 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -96,11 +96,9 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
unsigned int num_clusters_new, num_clusters_phys;
unsigned int last_clu = EXFAT_FREE_CLUSTER;
struct exfat_chain clu;
- struct exfat_dentry *ep, *ep2;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct exfat_entry_set_cache *es = NULL;
int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0;
/* check if the given file ID is opened */
@@ -153,28 +151,31 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
/* update the directory entry */
if (!evict) {
struct timespec64 ts;
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es;
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES, &ep);
+ ES_ALL_ENTRIES);
if (!es)
return -EIO;
- ep2 = ep + 1;
+ ep = exfat_get_dentry_cached(es, 0);
+ ep2 = exfat_get_dentry_cached(es, 1);
ts = current_time(inode);
exfat_set_entry_time(sbi, &ts,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_ms);
+ &ep->dentry.file.modify_time_cs);
ep->dentry.file.attr = cpu_to_le16(ei->attr);
/* File size should be zero if there is no cluster allocated */
if (ei->start_clu == EXFAT_EOF_CLUSTER) {
- ep->dentry.stream.valid_size = 0;
- ep->dentry.stream.size = 0;
+ ep2->dentry.stream.valid_size = 0;
+ ep2->dentry.stream.size = 0;
} else {
- ep->dentry.stream.valid_size = cpu_to_le64(new_size);
- ep->dentry.stream.size = ep->dentry.stream.valid_size;
+ ep2->dentry.stream.valid_size = cpu_to_le64(new_size);
+ ep2->dentry.stream.size = ep->dentry.stream.valid_size;
}
if (new_size == 0) {
@@ -185,10 +186,8 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- if (exfat_update_dir_chksum_with_entry_set(sb, es,
- inode_needs_sync(inode)))
- return -EIO;
- kfree(es);
+ exfat_update_dir_chksum_with_entry_set(es);
+ exfat_free_dentry_set(es, inode_needs_sync(inode));
}
/* cut off from the FAT chain */
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 785ead346543..cf9ca6c4d046 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -19,7 +19,6 @@
static int __exfat_write_inode(struct inode *inode, int sync)
{
- int ret = -EIO;
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
struct exfat_entry_set_cache *es = NULL;
@@ -43,11 +42,11 @@ static int __exfat_write_inode(struct inode *inode, int sync)
exfat_set_vol_flags(sb, VOL_DIRTY);
/* get the directory entry of given file or directory */
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES,
- &ep);
+ es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
if (!es)
return -EIO;
- ep2 = ep + 1;
+ ep = exfat_get_dentry_cached(es, 0);
+ ep2 = exfat_get_dentry_cached(es, 1);
ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode));
@@ -56,12 +55,12 @@ static int __exfat_write_inode(struct inode *inode, int sync)
&ep->dentry.file.create_tz,
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
- &ep->dentry.file.create_time_ms);
+ &ep->dentry.file.create_time_cs);
exfat_set_entry_time(sbi, &inode->i_mtime,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_ms);
+ &ep->dentry.file.modify_time_cs);
exfat_set_entry_time(sbi, &inode->i_atime,
&ep->dentry.file.access_tz,
&ep->dentry.file.access_time,
@@ -77,9 +76,9 @@ static int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
- ret = exfat_update_dir_chksum_with_entry_set(sb, es, sync);
- kfree(es);
- return ret;
+ exfat_update_dir_chksum_with_entry_set(es);
+ exfat_free_dentry_set(es, sync);
+ return 0;
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -110,8 +109,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
int ret, modified = false;
unsigned int last_clu;
struct exfat_chain new_clu;
- struct exfat_dentry *ep;
- struct exfat_entry_set_cache *es = NULL;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -222,34 +219,28 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
num_clusters += num_to_be_allocated;
*clu = new_clu.dir;
- if (ei->dir.dir != DIR_DELETED) {
+ if (ei->dir.dir != DIR_DELETED && modified) {
+ struct exfat_dentry *ep;
+ struct exfat_entry_set_cache *es;
+
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES, &ep);
+ ES_ALL_ENTRIES);
if (!es)
return -EIO;
/* get stream entry */
- ep++;
+ ep = exfat_get_dentry_cached(es, 1);
/* update directory entry */
- if (modified) {
- if (ep->dentry.stream.flags != ei->flags)
- ep->dentry.stream.flags = ei->flags;
-
- if (le32_to_cpu(ep->dentry.stream.start_clu) !=
- ei->start_clu)
- ep->dentry.stream.start_clu =
- cpu_to_le32(ei->start_clu);
-
- ep->dentry.stream.valid_size =
- cpu_to_le64(i_size_read(inode));
- ep->dentry.stream.size =
- ep->dentry.stream.valid_size;
- }
-
- if (exfat_update_dir_chksum_with_entry_set(sb, es,
- inode_needs_sync(inode)))
- return -EIO;
- kfree(es);
+ ep->dentry.stream.flags = ei->flags;
+ ep->dentry.stream.start_clu =
+ cpu_to_le32(ei->start_clu);
+ ep->dentry.stream.valid_size =
+ cpu_to_le64(i_size_read(inode));
+ ep->dentry.stream.size =
+ ep->dentry.stream.valid_size;
+
+ exfat_update_dir_chksum_with_entry_set(es);
+ exfat_free_dentry_set(es, inode_needs_sync(inode));
} /* end of if != DIR_DELETED */
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index ebd2cbe3cbc1..17d41f3d3709 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -32,7 +32,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- exfat_msg(sb, KERN_ERR, "error, %pV\n", &vaf);
+ exfat_err(sb, "error, %pV", &vaf);
va_end(args);
}
@@ -41,7 +41,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
sb->s_id);
} else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) {
sb->s_flags |= SB_RDONLY;
- exfat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
+ exfat_err(sb, "Filesystem has been set read-only");
}
}
@@ -75,7 +75,7 @@ static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off)
/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
- u8 tz, __le16 time, __le16 date, u8 time_ms)
+ u8 tz, __le16 time, __le16 date, u8 time_cs)
{
u16 t = le16_to_cpu(time);
u16 d = le16_to_cpu(date);
@@ -84,10 +84,10 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1);
- /* time_ms field represent 0 ~ 199(1990 ms) */
- if (time_ms) {
- ts->tv_sec += time_ms / 100;
- ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC;
+ /* time_cs field represent 0 ~ 199cs(1990 ms) */
+ if (time_cs) {
+ ts->tv_sec += time_cs / 100;
+ ts->tv_nsec = (time_cs % 100) * 10 * NSEC_PER_MSEC;
} else
ts->tv_nsec = 0;
@@ -101,7 +101,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
/* Convert linear UNIX date to a EXFAT time/date pair. */
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
- u8 *tz, __le16 *time, __le16 *date, u8 *time_ms)
+ u8 *tz, __le16 *time, __le16 *date, u8 *time_cs)
{
struct tm tm;
u16 t, d;
@@ -113,9 +113,9 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
*time = cpu_to_le16(t);
*date = cpu_to_le16(d);
- /* time_ms field represent 0 ~ 199(1990 ms) */
- if (time_ms)
- *time_ms = (tm.tm_sec & 1) * 100 +
+ /* time_cs field represent 0 ~ 199cs(1990 ms) */
+ if (time_cs)
+ *time_cs = (tm.tm_sec & 1) * 100 +
ts->tv_nsec / (10 * NSEC_PER_MSEC);
/*
@@ -136,17 +136,29 @@ void exfat_truncate_atime(struct timespec64 *ts)
ts->tv_nsec = 0;
}
-unsigned short exfat_calc_chksum_2byte(void *data, int len,
- unsigned short chksum, int type)
+u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type)
{
int i;
- unsigned char *c = (unsigned char *)data;
+ u8 *c = (u8 *)data;
for (i = 0; i < len; i++, c++) {
- if (((i == 2) || (i == 3)) && (type == CS_DIR_ENTRY))
+ if (unlikely(type == CS_DIR_ENTRY && (i == 2 || i == 3)))
continue;
- chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) +
- (unsigned short)*c;
+ chksum = ((chksum << 15) | (chksum >> 1)) + *c;
+ }
+ return chksum;
+}
+
+u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type)
+{
+ int i;
+ u8 *c = (u8 *)data;
+
+ for (i = 0; i < len; i++, c++) {
+ if (unlikely(type == CS_BOOT_SECTOR &&
+ (i == 106 || i == 107 || i == 112)))
+ continue;
+ chksum = ((chksum << 31) | (chksum >> 1)) + *c;
}
return chksum;
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index a2659a8a68a1..5b0f35329d63 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -147,16 +147,10 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr)
return charlen;
/*
- * Convert to UTF-16: code points above U+FFFF are encoded as
- * surrogate pairs.
* exfat_toupper() works only for code points up to the U+FFFF.
*/
- if (u > 0xFFFF) {
- hash = partial_name_hash(exfat_high_surrogate(u), hash);
- hash = partial_name_hash(exfat_low_surrogate(u), hash);
- } else {
- hash = partial_name_hash(exfat_toupper(sb, u), hash);
- }
+ hash = partial_name_hash(u <= 0xFFFF ? exfat_toupper(sb, u) : u,
+ hash);
}
qstr->hash = end_name_hash(hash);
@@ -185,14 +179,9 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len,
if (u_a <= 0xFFFF && u_b <= 0xFFFF) {
if (exfat_toupper(sb, u_a) != exfat_toupper(sb, u_b))
return 1;
- } else if (u_a > 0xFFFF && u_b > 0xFFFF) {
- if (exfat_low_surrogate(u_a) !=
- exfat_low_surrogate(u_b) ||
- exfat_high_surrogate(u_a) !=
- exfat_high_surrogate(u_b))
- return 1;
} else {
- return 1;
+ if (u_a != u_b)
+ return 1;
}
}
@@ -611,8 +600,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
int ret, dentry, num_entries, count;
struct exfat_chain cdir;
struct exfat_uni_name uni_name;
- struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es = NULL;
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
@@ -671,10 +658,14 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->num_subdirs = count;
} else {
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES, &ep);
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es;
+
+ es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
if (!es)
return -EIO;
- ep2 = ep + 1;
+ ep = exfat_get_dentry_cached(es, 0);
+ ep2 = exfat_get_dentry_cached(es, 1);
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
@@ -692,7 +683,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
exfat_fs_error(sb,
"non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
i_size_read(dir), ei->dir.dir, ei->entry);
- kfree(es);
+ exfat_free_dentry_set(es, false);
return -EIO;
}
@@ -700,18 +691,18 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
ep->dentry.file.create_tz,
ep->dentry.file.create_time,
ep->dentry.file.create_date,
- ep->dentry.file.create_time_ms);
+ ep->dentry.file.create_time_cs);
exfat_get_entry_time(sbi, &info->mtime,
ep->dentry.file.modify_tz,
ep->dentry.file.modify_time,
ep->dentry.file.modify_date,
- ep->dentry.file.modify_time_ms);
+ ep->dentry.file.modify_time_cs);
exfat_get_entry_time(sbi, &info->atime,
ep->dentry.file.access_tz,
ep->dentry.file.access_time,
ep->dentry.file.access_date,
0);
- kfree(es);
+ exfat_free_dentry_set(es, false);
if (info->type == TYPE_DIR) {
exfat_chain_set(&cdir, info->start_clu,
@@ -778,8 +769,8 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
if (d_unhashed(alias)) {
WARN_ON(alias->d_name.hash_len !=
dentry->d_name.hash_len);
- exfat_msg(sb, KERN_INFO,
- "rehashed a dentry(%p) in read lookup", alias);
+ exfat_info(sb, "rehashed a dentry(%p) in read lookup",
+ alias);
d_drop(dentry);
d_rehash(alias);
} else if (!S_ISDIR(i_mode)) {
@@ -824,7 +815,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
exfat_chain_dup(&cdir, &ei->dir);
entry = ei->entry;
if (ei->dir.dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ exfat_err(sb, "abnormal access to deleted dentry");
err = -ENOENT;
goto unlock;
}
@@ -979,7 +970,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
entry = ei->entry;
if (ei->dir.dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ exfat_err(sb, "abnormal access to deleted dentry");
err = -ENOENT;
goto unlock;
}
@@ -991,9 +982,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
err = exfat_check_dir_empty(sb, &clu_to_free);
if (err) {
if (err == -EIO)
- exfat_msg(sb, KERN_ERR,
- "failed to exfat_check_dir_empty : err(%d)",
- err);
+ exfat_err(sb, "failed to exfat_check_dir_empty : err(%d)",
+ err);
goto unlock;
}
@@ -1014,9 +1004,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
if (err) {
- exfat_msg(sb, KERN_ERR,
- "failed to exfat_remove_entries : err(%d)",
- err);
+ exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
goto unlock;
}
ei->dir.dir = DIR_DELETED;
@@ -1245,8 +1233,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -EINVAL;
if (ei->dir.dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR,
- "abnormal access to deleted source dentry");
+ exfat_err(sb, "abnormal access to deleted source dentry");
return -ENOENT;
}
@@ -1268,8 +1255,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
new_ei = EXFAT_I(new_inode);
if (new_ei->dir.dir == DIR_DELETED) {
- exfat_msg(sb, KERN_ERR,
- "abnormal access to deleted target dentry");
+ exfat_err(sb, "abnormal access to deleted target dentry");
goto out;
}
@@ -1431,8 +1417,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(new_inode->i_mode))
drop_nlink(new_inode);
} else {
- exfat_msg(sb, KERN_WARNING,
- "abnormal access to an inode dropped");
+ exfat_warn(sb, "abnormal access to an inode dropped");
WARN_ON(new_inode->i_nlink == 0);
}
new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime =
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 6d1c3ae130ff..57b5a7a4d1f7 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -503,21 +503,17 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN,
(wchar_t *)uniname, MAX_NAME_LENGTH + 2);
if (unilen < 0) {
- exfat_msg(sb, KERN_ERR,
- "failed to %s (err : %d) nls len : %d",
- __func__, unilen, len);
+ exfat_err(sb, "failed to %s (err : %d) nls len : %d",
+ __func__, unilen, len);
return unilen;
}
if (unilen > MAX_NAME_LENGTH) {
- exfat_msg(sb, KERN_ERR,
- "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
- __func__, len, unilen, MAX_NAME_LENGTH);
+ exfat_err(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
+ __func__, len, unilen, MAX_NAME_LENGTH);
return -ENAMETOOLONG;
}
- p_uniname->name_len = unilen & 0xFF;
-
for (i = 0; i < unilen; i++) {
if (*uniname < 0x0020 ||
exfat_wstrchr(bad_uni_chars, *uniname))
@@ -529,7 +525,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
*uniname = '\0';
p_uniname->name_len = unilen;
- p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0,
+ p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0,
CS_DEFAULT);
if (p_lossy)
@@ -537,22 +533,9 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
return unilen;
}
-#define PLANE_SIZE 0x00010000
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800
#define SURROGATE_LOW 0x00000400
-#define SURROGATE_BITS 0x000003ff
-
-unsigned short exfat_high_surrogate(unicode_t u)
-{
- return ((u - PLANE_SIZE) >> 10) + SURROGATE_PAIR;
-}
-
-unsigned short exfat_low_surrogate(unicode_t u)
-{
- return ((u - PLANE_SIZE) & SURROGATE_BITS) | SURROGATE_PAIR |
- SURROGATE_LOW;
-}
static int __exfat_utf16_to_nls(struct super_block *sb,
struct exfat_uni_name *p_uniname, unsigned char *p_cstring,
@@ -638,7 +621,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
*uniname = '\0';
p_uniname->name_len = unilen;
- p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0,
+ p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0,
CS_DEFAULT);
if (p_lossy)
@@ -670,7 +653,8 @@ static int exfat_load_upcase_table(struct super_block *sb,
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned int sect_size = sb->s_blocksize;
- unsigned int i, index = 0, checksum = 0;
+ unsigned int i, index = 0;
+ u32 chksum = 0;
int ret;
unsigned char skip = false;
unsigned short *upcase_table;
@@ -687,9 +671,8 @@ static int exfat_load_upcase_table(struct super_block *sb,
bh = sb_bread(sb, sector);
if (!bh) {
- exfat_msg(sb, KERN_ERR,
- "failed to read sector(0x%llx)\n",
- (unsigned long long)sector);
+ exfat_err(sb, "failed to read sector(0x%llx)\n",
+ (unsigned long long)sector);
ret = -EIO;
goto free_table;
}
@@ -697,13 +680,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
unsigned short uni = get_unaligned_le16(bh->b_data + i);
- checksum = ((checksum & 1) ? 0x80000000 : 0) +
- (checksum >> 1) +
- *(((unsigned char *)bh->b_data) + i);
- checksum = ((checksum & 1) ? 0x80000000 : 0) +
- (checksum >> 1) +
- *(((unsigned char *)bh->b_data) + (i + 1));
-
if (skip) {
index += uni;
skip = false;
@@ -716,15 +692,15 @@ static int exfat_load_upcase_table(struct super_block *sb,
index++;
}
}
+ chksum = exfat_calc_chksum32(bh->b_data, i, chksum, CS_DEFAULT);
brelse(bh);
}
- if (index >= 0xFFFF && utbl_checksum == checksum)
+ if (index >= 0xFFFF && utbl_checksum == chksum)
return 0;
- exfat_msg(sb, KERN_ERR,
- "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)\n",
- index, checksum, utbl_checksum);
+ exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
+ index, chksum, utbl_checksum);
ret = -EINVAL;
free_table:
exfat_free_upcase_table(sbi);
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index a846ff555656..e650e65536f8 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -49,7 +49,7 @@ static void exfat_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
exfat_set_vol_flags(sb, VOL_CLEAN);
exfat_free_bitmap(sbi);
- brelse(sbi->pbr_bh);
+ brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
call_rcu(&sbi->rcu, exfat_delayed_free);
@@ -101,8 +101,8 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
- bool sync = 0;
+ struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
+ bool sync;
/* flags are not changed */
if (sbi->vol_flag == new_flag)
@@ -116,18 +116,18 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
if (sb_rdonly(sb))
return 0;
- bpb->bsx.vol_flags = cpu_to_le16(new_flag);
+ p_boot->vol_flags = cpu_to_le16(new_flag);
- if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh))
+ if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->boot_bh))
sync = true;
else
sync = false;
- set_buffer_uptodate(sbi->pbr_bh);
- mark_buffer_dirty(sbi->pbr_bh);
+ set_buffer_uptodate(sbi->boot_bh);
+ mark_buffer_dirty(sbi->boot_bh);
if (sync)
- sync_dirty_buffer(sbi->pbr_bh);
+ sync_dirty_buffer(sbi->boot_bh);
return 0;
}
@@ -273,9 +273,8 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_charset:
exfat_free_iocharset(sbi);
- opts->iocharset = kstrdup(param->string, GFP_KERNEL);
- if (!opts->iocharset)
- return -ENOMEM;
+ opts->iocharset = param->string;
+ param->string = NULL;
break;
case Opt_errors:
opts->errors = result.uint_32;
@@ -366,151 +365,208 @@ static int exfat_read_root(struct inode *inode)
return 0;
}
-static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb)
+static int exfat_calibrate_blocksize(struct super_block *sb, int logical_sect)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data;
- unsigned short logical_sect = 0;
-
- logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits;
if (!is_power_of_2(logical_sect) ||
logical_sect < 512 || logical_sect > 4096) {
- exfat_msg(sb, KERN_ERR, "bogus logical sector size %u",
- logical_sect);
- return NULL;
+ exfat_err(sb, "bogus logical sector size %u", logical_sect);
+ return -EIO;
}
if (logical_sect < sb->s_blocksize) {
- exfat_msg(sb, KERN_ERR,
- "logical sector size too small for device (logical sector size = %u)",
- logical_sect);
- return NULL;
+ exfat_err(sb, "logical sector size too small for device (logical sector size = %u)",
+ logical_sect);
+ return -EIO;
}
if (logical_sect > sb->s_blocksize) {
- brelse(sbi->pbr_bh);
- sbi->pbr_bh = NULL;
+ brelse(sbi->boot_bh);
+ sbi->boot_bh = NULL;
if (!sb_set_blocksize(sb, logical_sect)) {
- exfat_msg(sb, KERN_ERR,
- "unable to set blocksize %u", logical_sect);
- return NULL;
+ exfat_err(sb, "unable to set blocksize %u",
+ logical_sect);
+ return -EIO;
}
- sbi->pbr_bh = sb_bread(sb, 0);
- if (!sbi->pbr_bh) {
- exfat_msg(sb, KERN_ERR,
- "unable to read boot sector (logical sector size = %lu)",
- sb->s_blocksize);
- return NULL;
+ sbi->boot_bh = sb_bread(sb, 0);
+ if (!sbi->boot_bh) {
+ exfat_err(sb, "unable to read boot sector (logical sector size = %lu)",
+ sb->s_blocksize);
+ return -EIO;
}
-
- p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
}
- return p_pbr;
+ return 0;
}
-/* mount the file system volume */
-static int __exfat_fill_super(struct super_block *sb)
+static int exfat_read_boot_sector(struct super_block *sb)
{
- int ret;
- struct pbr *p_pbr;
- struct pbr64 *p_bpb;
+ struct boot_sector *p_boot;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
/* set block size to read super block */
sb_min_blocksize(sb, 512);
/* read boot sector */
- sbi->pbr_bh = sb_bread(sb, 0);
- if (!sbi->pbr_bh) {
- exfat_msg(sb, KERN_ERR, "unable to read boot sector");
+ sbi->boot_bh = sb_bread(sb, 0);
+ if (!sbi->boot_bh) {
+ exfat_err(sb, "unable to read boot sector");
return -EIO;
}
+ p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
- /* PRB is read */
- p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
-
- /* check the validity of PBR */
- if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) {
- exfat_msg(sb, KERN_ERR, "invalid boot record signature");
- ret = -EINVAL;
- goto free_bh;
+ /* check the validity of BOOT */
+ if (le16_to_cpu((p_boot->signature)) != BOOT_SIGNATURE) {
+ exfat_err(sb, "invalid boot record signature");
+ return -EINVAL;
}
-
- /* check logical sector size */
- p_pbr = exfat_read_pbr_with_logical_sector(sb);
- if (!p_pbr) {
- ret = -EIO;
- goto free_bh;
+ if (memcmp(p_boot->fs_name, STR_EXFAT, BOOTSEC_FS_NAME_LEN)) {
+ exfat_err(sb, "invalid fs_name"); /* fs_name may unprintable */
+ return -EINVAL;
}
/*
- * res_zero field must be filled with zero to prevent mounting
+ * must_be_zero field must be filled with zero to prevent mounting
* from FAT volume.
*/
- if (memchr_inv(p_pbr->bpb.f64.res_zero, 0,
- sizeof(p_pbr->bpb.f64.res_zero))) {
- ret = -EINVAL;
- goto free_bh;
- }
+ if (memchr_inv(p_boot->must_be_zero, 0, sizeof(p_boot->must_be_zero)))
+ return -EINVAL;
- p_bpb = (struct pbr64 *)p_pbr;
- if (!p_bpb->bsx.num_fats) {
- exfat_msg(sb, KERN_ERR, "bogus number of FAT structure");
- ret = -EINVAL;
- goto free_bh;
+ if (p_boot->num_fats != 1 && p_boot->num_fats != 2) {
+ exfat_err(sb, "bogus number of FAT structure");
+ return -EINVAL;
}
- sbi->sect_per_clus = 1 << p_bpb->bsx.sect_per_clus_bits;
- sbi->sect_per_clus_bits = p_bpb->bsx.sect_per_clus_bits;
- sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits;
+ sbi->sect_per_clus = 1 << p_boot->sect_per_clus_bits;
+ sbi->sect_per_clus_bits = p_boot->sect_per_clus_bits;
+ sbi->cluster_size_bits = p_boot->sect_per_clus_bits +
+ p_boot->sect_size_bits;
sbi->cluster_size = 1 << sbi->cluster_size_bits;
- sbi->num_FAT_sectors = le32_to_cpu(p_bpb->bsx.fat_length);
- sbi->FAT1_start_sector = le32_to_cpu(p_bpb->bsx.fat_offset);
- sbi->FAT2_start_sector = p_bpb->bsx.num_fats == 1 ?
- sbi->FAT1_start_sector :
- sbi->FAT1_start_sector + sbi->num_FAT_sectors;
- sbi->data_start_sector = le32_to_cpu(p_bpb->bsx.clu_offset);
- sbi->num_sectors = le64_to_cpu(p_bpb->bsx.vol_length);
+ sbi->num_FAT_sectors = le32_to_cpu(p_boot->fat_length);
+ sbi->FAT1_start_sector = le32_to_cpu(p_boot->fat_offset);
+ sbi->FAT2_start_sector = le32_to_cpu(p_boot->fat_offset);
+ if (p_boot->num_fats == 2)
+ sbi->FAT2_start_sector += sbi->num_FAT_sectors;
+ sbi->data_start_sector = le32_to_cpu(p_boot->clu_offset);
+ sbi->num_sectors = le64_to_cpu(p_boot->vol_length);
/* because the cluster index starts with 2 */
- sbi->num_clusters = le32_to_cpu(p_bpb->bsx.clu_count) +
+ sbi->num_clusters = le32_to_cpu(p_boot->clu_count) +
EXFAT_RESERVED_CLUSTERS;
- sbi->root_dir = le32_to_cpu(p_bpb->bsx.root_cluster);
+ sbi->root_dir = le32_to_cpu(p_boot->root_cluster);
sbi->dentries_per_clu = 1 <<
(sbi->cluster_size_bits - DENTRY_SIZE_BITS);
- sbi->vol_flag = le16_to_cpu(p_bpb->bsx.vol_flags);
+ sbi->vol_flag = le16_to_cpu(p_boot->vol_flags);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
- if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) {
- sbi->vol_flag |= VOL_DIRTY;
- exfat_msg(sb, KERN_WARNING,
- "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
+ /* check consistencies */
+ if (sbi->num_FAT_sectors << p_boot->sect_size_bits <
+ sbi->num_clusters * 4) {
+ exfat_err(sb, "bogus fat length");
+ return -EINVAL;
}
+ if (sbi->data_start_sector <
+ sbi->FAT1_start_sector + sbi->num_FAT_sectors * p_boot->num_fats) {
+ exfat_err(sb, "bogus data start sector");
+ return -EINVAL;
+ }
+ if (sbi->vol_flag & VOL_DIRTY)
+ exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
+ if (sbi->vol_flag & ERR_MEDIUM)
+ exfat_warn(sb, "Medium has reported failures. Some data may be lost.");
/* exFAT file size is limited by a disk volume size */
sb->s_maxbytes = (u64)(sbi->num_clusters - EXFAT_RESERVED_CLUSTERS) <<
sbi->cluster_size_bits;
+ /* check logical sector size */
+ if (exfat_calibrate_blocksize(sb, 1 << p_boot->sect_size_bits))
+ return -EIO;
+
+ return 0;
+}
+
+static int exfat_verify_boot_region(struct super_block *sb)
+{
+ struct buffer_head *bh = NULL;
+ u32 chksum = 0;
+ __le32 *p_sig, *p_chksum;
+ int sn, i;
+
+ /* read boot sector sub-regions */
+ for (sn = 0; sn < 11; sn++) {
+ bh = sb_bread(sb, sn);
+ if (!bh)
+ return -EIO;
+
+ if (sn != 0 && sn <= 8) {
+ /* extended boot sector sub-regions */
+ p_sig = (__le32 *)&bh->b_data[sb->s_blocksize - 4];
+ if (le32_to_cpu(*p_sig) != EXBOOT_SIGNATURE)
+ exfat_warn(sb, "Invalid exboot-signature(sector = %d): 0x%08x",
+ sn, le32_to_cpu(*p_sig));
+ }
+
+ chksum = exfat_calc_chksum32(bh->b_data, sb->s_blocksize,
+ chksum, sn ? CS_DEFAULT : CS_BOOT_SECTOR);
+ brelse(bh);
+ }
+
+ /* boot checksum sub-regions */
+ bh = sb_bread(sb, sn);
+ if (!bh)
+ return -EIO;
+
+ for (i = 0; i < sb->s_blocksize; i += sizeof(u32)) {
+ p_chksum = (__le32 *)&bh->b_data[i];
+ if (le32_to_cpu(*p_chksum) != chksum) {
+ exfat_err(sb, "Invalid boot checksum (boot checksum : 0x%08x, checksum : 0x%08x)",
+ le32_to_cpu(*p_chksum), chksum);
+ brelse(bh);
+ return -EINVAL;
+ }
+ }
+ brelse(bh);
+ return 0;
+}
+
+/* mount the file system volume */
+static int __exfat_fill_super(struct super_block *sb)
+{
+ int ret;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ ret = exfat_read_boot_sector(sb);
+ if (ret) {
+ exfat_err(sb, "failed to read boot sector");
+ goto free_bh;
+ }
+
+ ret = exfat_verify_boot_region(sb);
+ if (ret) {
+ exfat_err(sb, "invalid boot region");
+ goto free_bh;
+ }
+
ret = exfat_create_upcase_table(sb);
if (ret) {
- exfat_msg(sb, KERN_ERR, "failed to load upcase table");
+ exfat_err(sb, "failed to load upcase table");
goto free_bh;
}
ret = exfat_load_bitmap(sb);
if (ret) {
- exfat_msg(sb, KERN_ERR, "failed to load alloc-bitmap");
+ exfat_err(sb, "failed to load alloc-bitmap");
goto free_upcase_table;
}
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
if (ret) {
- exfat_msg(sb, KERN_ERR, "failed to scan clusters");
+ exfat_err(sb, "failed to scan clusters");
goto free_alloc_bitmap;
}
@@ -521,7 +577,7 @@ free_alloc_bitmap:
free_upcase_table:
exfat_free_upcase_table(sbi);
free_bh:
- brelse(sbi->pbr_bh);
+ brelse(sbi->boot_bh);
return ret;
}
@@ -539,8 +595,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q)) {
- exfat_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but the device does not support discard");
+ exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
opts->discard = 0;
}
}
@@ -555,7 +610,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
err = __exfat_fill_super(sb);
if (err) {
- exfat_msg(sb, KERN_ERR, "failed to recognize exfat type");
+ exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
}
@@ -567,8 +622,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
else {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
- exfat_msg(sb, KERN_ERR, "IO charset %s not found",
- sbi->options.iocharset);
+ exfat_err(sb, "IO charset %s not found",
+ sbi->options.iocharset);
err = -EINVAL;
goto free_table;
}
@@ -581,7 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
root_inode = new_inode(sb);
if (!root_inode) {
- exfat_msg(sb, KERN_ERR, "failed to allocate root inode.");
+ exfat_err(sb, "failed to allocate root inode");
err = -ENOMEM;
goto free_table;
}
@@ -590,7 +645,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
inode_set_iversion(root_inode, 1);
err = exfat_read_root(root_inode);
if (err) {
- exfat_msg(sb, KERN_ERR, "failed to initialize root inode.");
+ exfat_err(sb, "failed to initialize root inode");
goto put_inode;
}
@@ -599,7 +654,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
- exfat_msg(sb, KERN_ERR, "failed to get the root dentry");
+ exfat_err(sb, "failed to get the root dentry");
err = -ENOMEM;
goto put_inode;
}
@@ -613,7 +668,7 @@ put_inode:
free_table:
exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
- brelse(sbi->pbr_bh);
+ brelse(sbi->boot_bh);
check_nls_io:
unload_nls(sbi->nls_io);
@@ -630,7 +685,12 @@ static int exfat_get_tree(struct fs_context *fc)
static void exfat_free(struct fs_context *fc)
{
- kfree(fc->s_fs_info);
+ struct exfat_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) {
+ exfat_free_iocharset(sbi);
+ kfree(sbi);
+ }
}
static const struct fs_context_operations exfat_context_ops = {
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b4de9a0f170d..60378ddf1424 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -79,7 +79,7 @@ out_unlock:
/*
* The lock ordering for ext2 DAX fault paths is:
*
- * mmap_sem (MM)
+ * mmap_lock (MM)
* sb_start_pagefault (vfs, freeze)
* ext2_inode_info->dax_sem
* address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index cf9e430514c4..1afa5a4bcb5f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -102,9 +102,10 @@ config EXT4_DEBUG
using dynamic debug control for mb_debug() / ext_debug() msgs.
config EXT4_KUNIT_TESTS
- tristate "KUnit tests for ext4"
+ tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
select EXT4_FS
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This builds the ext4 KUnit tests.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..2e42f47a7f98 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c654205f648d..1d82336b1cd4 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
struct qstr qstr = {.name = str, .len = len };
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *inode = READ_ONCE(parent->d_inode);
+ char strbuf[DNAME_INLINE_LEN];
if (!inode || !IS_CASEFOLDED(inode) ||
!EXT4_SB(inode->i_sb)->s_encoding) {
@@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
return memcmp(str, name->name, len);
}
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+
return ext4_ci_compare(inode, name, &qstr, false);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b08841f70b69..42f5060f3cdf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -426,13 +426,16 @@ struct flex_groups {
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
@@ -440,14 +443,16 @@ struct flex_groups {
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_PROJINHERIT_FL | \
+ EXT4_DAX_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -459,6 +464,10 @@ struct flex_groups {
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -499,6 +508,7 @@ enum {
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
@@ -1135,9 +1145,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1180,6 +1190,8 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
@@ -1992,6 +2004,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
+#define EXT4_FLAGS_BDEV_IS_DAX 2
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
@@ -2705,7 +2718,7 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d088ff1e902..221f240eae60 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2844,7 +2844,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 54d324e80fe5..df25d38d6539 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1116,7 +1116,7 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40ec5c7ef0d3..10dd470876b3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4403,9 +4403,11 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DAX))
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4417,14 +4419,21 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4435,8 +4444,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4650,7 +4664,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2162db0c747d..999cf6add39c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +332,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -309,9 +340,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
oldflags = ei->i_flags;
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
if (err)
goto flags_out;
@@ -320,10 +348,16 @@ static int ext4_ioctl_setflags(struct inode *inode,
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
@@ -369,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -381,7 +417,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
+
inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -390,17 +427,18 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -527,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
xflags |= FS_XFLAG_NOATIME;
if (iflags & EXT4_PROJINHERIT_FL)
xflags |= FS_XFLAG_PROJINHERIT;
+ if (iflags & EXT4_DAX_FL)
+ xflags |= FS_XFLAG_DAX;
return xflags;
}
#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
+ FS_XFLAG_DAX)
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
@@ -551,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
iflags |= EXT4_NOATIME_FL;
if (xflags & FS_XFLAG_PROJINHERIT)
iflags |= EXT4_PROJINHERIT_FL;
+ if (xflags & FS_XFLAG_DAX)
+ iflags |= EXT4_DAX_FL;
return iflags;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9083113a8c0..c0a331e2feb0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4708,7 +4708,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
- seq = *this_cpu_ptr(&discard_pa_seq);
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a29e8ea1a7ab..330957ed1f05 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,11 +93,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_mmap_rwsem (inode->i_mmap_rwsem)!
*
* page fault path:
- * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
* page lock -> i_data_sem (rw)
*
* buffered write path:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> page lock ->
* i_data_sem (rw)
*
@@ -107,7 +107,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_data_sem (rw)
*
* direct IO:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
*
* writepages:
@@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= SB_RDONLY;
} else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
@@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function,
va_end(args);
if (sb_rdonly(sb) == 0) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
* before ->s_flags update
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
panic("EXT4-fs panic from previous error\n");
- }
}
void __ext4_msg(struct super_block *sb,
@@ -1324,6 +1318,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
return -EINVAL;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
res = ext4_convert_inline_data(inode);
if (res)
return res;
@@ -1349,7 +1346,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
}
return res;
}
@@ -1376,7 +1373,7 @@ retry:
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
@@ -1514,7 +1511,8 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
@@ -1581,6 +1579,9 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_dax, "dax"},
+ {Opt_dax_always, "dax=always"},
+ {Opt_dax_inode, "dax=inode"},
+ {Opt_dax_never, "dax=never"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_warn_on_error, "warn_on_error"},
@@ -1729,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0800
static const struct mount_opts {
int token;
@@ -1778,7 +1780,13 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+ {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
+ {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -2123,13 +2131,56 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
- } else if (token == Opt_dax) {
+ } else if (token == Opt_dax || token == Opt_dax_always ||
+ token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ switch (token) {
+ case Opt_dax:
+ case Opt_dax_always:
+ if (is_remount &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+ fail_dax_change_remount:
+ ext4_msg(sb, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -1;
+ }
+ if (is_remount &&
+ (test_opt(sb, DATA_FLAGS) ==
+ EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ break;
+ case Opt_dax_never:
+ if (is_remount &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ break;
+ case Opt_dax_inode:
+ if (is_remount &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ /* Strictly for printing options */
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+ break;
+ }
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
return -1;
#endif
} else if (token == Opt_data_err_abort) {
@@ -2293,7 +2344,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2353,6 +2404,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
fscrypt_show_test_dummy_encryption(seq, sep, sb);
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -2383,6 +2445,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -4017,7 +4080,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and delalloc");
goto failed_mount;
}
- if (test_opt(sb, DAX)) {
+ if (test_opt(sb, DAX_ALWAYS)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and dax");
goto failed_mount;
@@ -4127,13 +4190,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (bdev_dax_supported(sb->s_bdev, blocksize))
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
goto failed_mount;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
goto failed_mount;
@@ -5447,12 +5513,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5468,12 +5528,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
- }
-
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dec1244dd062..bbd5e7e0632b 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp)
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -241,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
if (err)
goto out_stop;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
out_stop:
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b29a40738ac..7d2f6576d954 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ffe21ac77f78..730b91fa0dd7 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -124,6 +124,7 @@ struct ext4_xattr_inode_array {
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..8cfa74a56361
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index bb68d21e1f8c..d13c5c6a9787 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -127,3 +127,13 @@ config F2FS_FS_ZSTD
default y
help
Support ZSTD compress algorithm, if unsure, say Y.
+
+config F2FS_FS_LZORLE
+ bool "LZO-RLE compression support"
+ depends on F2FS_FS_COMPRESSION
+ depends on F2FS_FS_LZO
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ default y
+ help
+ Support LZO-RLE compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index b96823c59b15..124868c13f80 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/acl.h
*
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 852890b72d6a..236064930251 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -86,6 +86,8 @@ repeat:
return ERR_PTR(err);
}
+ f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
@@ -220,6 +222,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.is_por = (type == META_POR),
};
struct blk_plug plug;
+ int err;
if (unlikely(type == META_POR))
fio.op_flags &= ~REQ_META;
@@ -263,8 +266,11 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
}
fio.page = page;
- f2fs_submit_page_bio(&fio);
- f2fs_put_page(page, 0);
+ err = f2fs_submit_page_bio(&fio);
+ f2fs_put_page(page, err ? 1 : 0);
+
+ if (!err)
+ f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
}
out:
blk_finish_plug(&plug);
@@ -889,8 +895,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
int i;
int err;
- sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks),
- GFP_KERNEL);
+ sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks),
+ GFP_KERNEL);
if (!sbi->ckpt)
return -ENOMEM;
/*
@@ -1160,10 +1166,12 @@ static int block_operations(struct f2fs_sb_info *sbi)
.nr_to_write = LONG_MAX,
.for_reclaim = 0,
};
- struct blk_plug plug;
int err = 0, cnt = 0;
- blk_start_plug(&plug);
+ /*
+ * Let's flush inline_data in dirty node pages.
+ */
+ f2fs_flush_inline_data(sbi);
retry_flush_quotas:
f2fs_lock_all(sbi);
@@ -1192,7 +1200,7 @@ retry_flush_dents:
f2fs_unlock_all(sbi);
err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
if (err)
- goto out;
+ return err;
cond_resched();
goto retry_flush_quotas;
}
@@ -1208,7 +1216,7 @@ retry_flush_dents:
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
- goto out;
+ return err;
cond_resched();
goto retry_flush_quotas;
}
@@ -1224,7 +1232,7 @@ retry_flush_nodes:
if (err) {
up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
- goto out;
+ return err;
}
cond_resched();
goto retry_flush_nodes;
@@ -1236,8 +1244,6 @@ retry_flush_nodes:
*/
__prepare_cp_block(sbi);
up_write(&sbi->node_change);
-out:
- blk_finish_plug(&plug);
return err;
}
@@ -1260,6 +1266,9 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
if (unlikely(f2fs_cp_error(sbi)))
break;
+ if (type == F2FS_DIRTY_META)
+ f2fs_sync_meta_pages(sbi, META, LONG_MAX,
+ FS_CP_META_IO);
io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
@@ -1553,7 +1562,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return 0;
f2fs_warn(sbi, "Start checkpoint disabled!");
}
- mutex_lock(&sbi->cp_mutex);
+ if (cpc->reason != CP_RESIZE)
+ mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1622,7 +1632,8 @@ stop:
f2fs_update_time(sbi, CP_TIME);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
- mutex_unlock(&sbi->cp_mutex);
+ if (cpc->reason != CP_RESIZE)
+ mutex_unlock(&sbi->cp_mutex);
return err;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index df7b2d15eacd..1e02a8c106b0 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -65,15 +65,6 @@ static void f2fs_set_compressed_page(struct page *page,
page->mapping = inode->i_mapping;
}
-static void f2fs_put_compressed_page(struct page *page)
-{
- set_page_private(page, (unsigned long)NULL);
- ClearPagePrivate(page);
- page->mapping = NULL;
- unlock_page(page);
- put_page(page);
-}
-
static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock)
{
int i;
@@ -98,8 +89,7 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len)
f2fs_drop_rpages(cc, len, true);
}
-static void f2fs_put_rpages_mapping(struct compress_ctx *cc,
- struct address_space *mapping,
+static void f2fs_put_rpages_mapping(struct address_space *mapping,
pgoff_t start, int len)
{
int i;
@@ -236,7 +226,12 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc)
if (!cc->private)
return -ENOMEM;
- cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size);
+ /*
+ * we do not change cc->clen to LZ4_compressBound(inputsize) to
+ * adapt worst compress case, because lz4 compressor can handle
+ * output budget properly.
+ */
+ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE;
return 0;
}
@@ -252,11 +247,9 @@ static int lz4_compress_pages(struct compress_ctx *cc)
len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
cc->clen, cc->private);
- if (!len) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id);
- return -EIO;
- }
+ if (!len)
+ return -EAGAIN;
+
cc->clen = len;
return 0;
}
@@ -366,6 +359,13 @@ static int zstd_compress_pages(struct compress_ctx *cc)
return -EIO;
}
+ /*
+ * there is compressed data remained in intermediate buffer due to
+ * no more space in cbuf.cdata
+ */
+ if (ret)
+ return -EAGAIN;
+
cc->clen = outbuf.pos;
return 0;
}
@@ -451,6 +451,31 @@ static const struct f2fs_compress_ops f2fs_zstd_ops = {
};
#endif
+#ifdef CONFIG_F2FS_FS_LZO
+#ifdef CONFIG_F2FS_FS_LZORLE
+static int lzorle_compress_pages(struct compress_ctx *cc)
+{
+ int ret;
+
+ ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
+ &cc->clen, cc->private);
+ if (ret != LZO_E_OK) {
+ printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ return -EIO;
+ }
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_lzorle_ops = {
+ .init_compress_ctx = lzo_init_compress_ctx,
+ .destroy_compress_ctx = lzo_destroy_compress_ctx,
+ .compress_pages = lzorle_compress_pages,
+ .decompress_pages = lzo_decompress_pages,
+};
+#endif
+#endif
+
static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#ifdef CONFIG_F2FS_FS_LZO
&f2fs_lzo_ops,
@@ -467,6 +492,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#else
NULL,
#endif
+#if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE)
+ &f2fs_lzorle_ops,
+#else
+ NULL,
+#endif
};
bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -476,17 +506,47 @@ bool f2fs_is_compress_backend_ready(struct inode *inode)
return f2fs_cops[F2FS_I(inode)->i_compress_algorithm];
}
-static struct page *f2fs_grab_page(void)
+static mempool_t *compress_page_pool = NULL;
+static int num_compress_pages = 512;
+module_param(num_compress_pages, uint, 0444);
+MODULE_PARM_DESC(num_compress_pages,
+ "Number of intermediate compress pages to preallocate");
+
+int f2fs_init_compress_mempool(void)
+{
+ compress_page_pool = mempool_create_page_pool(num_compress_pages, 0);
+ if (!compress_page_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void f2fs_destroy_compress_mempool(void)
+{
+ mempool_destroy(compress_page_pool);
+}
+
+static struct page *f2fs_compress_alloc_page(void)
{
struct page *page;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return NULL;
+ page = mempool_alloc(compress_page_pool, GFP_NOFS);
lock_page(page);
+
return page;
}
+static void f2fs_compress_free_page(struct page *page)
+{
+ if (!page)
+ return;
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ page->mapping = NULL;
+ unlock_page(page);
+ mempool_free(page, compress_page_pool);
+}
+
static int f2fs_compress_pages(struct compress_ctx *cc)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
@@ -516,7 +576,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
}
for (i = 0; i < cc->nr_cpages; i++) {
- cc->cpages[i] = f2fs_grab_page();
+ cc->cpages[i] = f2fs_compress_alloc_page();
if (!cc->cpages[i]) {
ret = -ENOMEM;
goto out_free_cpages;
@@ -561,7 +621,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
vunmap(cc->rbuf);
for (i = nr_cpages; i < cc->nr_cpages; i++) {
- f2fs_put_compressed_page(cc->cpages[i]);
+ f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
@@ -581,7 +641,7 @@ out_vunmap_rbuf:
out_free_cpages:
for (i = 0; i < cc->nr_cpages; i++) {
if (cc->cpages[i])
- f2fs_put_compressed_page(cc->cpages[i]);
+ f2fs_compress_free_page(cc->cpages[i]);
}
kfree(cc->cpages);
cc->cpages = NULL;
@@ -788,6 +848,8 @@ static bool cluster_may_compress(struct compress_ctx *cc)
return false;
if (!f2fs_cluster_is_full(cc))
return false;
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode))))
+ return false;
return __cluster_may_compress(cc);
}
@@ -879,7 +941,7 @@ retry:
if (!PageUptodate(page)) {
f2fs_unlock_rpages(cc, i + 1);
- f2fs_put_rpages_mapping(cc, mapping, start_idx,
+ f2fs_put_rpages_mapping(mapping, start_idx,
cc->cluster_size);
f2fs_destroy_compress_ctx(cc);
goto retry;
@@ -914,7 +976,7 @@ retry:
unlock_pages:
f2fs_unlock_rpages(cc, i);
release_pages:
- f2fs_put_rpages_mapping(cc, mapping, start_idx, i);
+ f2fs_put_rpages_mapping(mapping, start_idx, i);
f2fs_destroy_compress_ctx(cc);
return ret;
}
@@ -954,6 +1016,55 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
return first_index;
}
+int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock)
+{
+ void *fsdata = NULL;
+ struct page *pagep;
+ int log_cluster_size = F2FS_I(inode)->i_log_cluster_size;
+ pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) <<
+ log_cluster_size;
+ int err;
+
+ err = f2fs_is_compressed_cluster(inode, start_idx);
+ if (err < 0)
+ return err;
+
+ /* truncate normal cluster */
+ if (!err)
+ return f2fs_do_truncate_blocks(inode, from, lock);
+
+ /* truncate compressed cluster */
+ err = f2fs_prepare_compress_overwrite(inode, &pagep,
+ start_idx, &fsdata);
+
+ /* should not be a normal cluster */
+ f2fs_bug_on(F2FS_I_SB(inode), err == 0);
+
+ if (err <= 0)
+ return err;
+
+ if (err > 0) {
+ struct page **rpages = fsdata;
+ int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int i;
+
+ for (i = cluster_size - 1; i >= 0; i--) {
+ loff_t start = rpages[i]->index << PAGE_SHIFT;
+
+ if (from <= start) {
+ zero_user_segment(rpages[i], 0, PAGE_SIZE);
+ } else {
+ zero_user_segment(rpages[i], from - start,
+ PAGE_SIZE);
+ break;
+ }
+ }
+
+ f2fs_compress_write_end(inode, fsdata, start_idx, true);
+ }
+ return 0;
+}
+
static int f2fs_write_compressed_pages(struct compress_ctx *cc,
int *submitted,
struct writeback_control *wbc,
@@ -985,7 +1096,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- if (!f2fs_trylock_op(sbi))
+ if (!IS_NOQUOTA(inode) && !f2fs_trylock_op(sbi))
return -EAGAIN;
set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
@@ -1092,7 +1203,8 @@ unlock_continue:
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ if (!IS_NOQUOTA(inode))
+ f2fs_unlock_op(sbi);
spin_lock(&fi->i_size_lock);
if (fi->last_disk_size < psize)
@@ -1118,7 +1230,8 @@ out_put_cic:
out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
- f2fs_unlock_op(sbi);
+ if (!IS_NOQUOTA(inode))
+ f2fs_unlock_op(sbi);
return -EAGAIN;
}
@@ -1132,7 +1245,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
if (unlikely(bio->bi_status))
mapping_set_error(cic->inode->i_mapping, -EIO);
- f2fs_put_compressed_page(page);
+ f2fs_compress_free_page(page);
dec_page_count(sbi, F2FS_WB_DATA);
@@ -1293,7 +1406,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page;
- page = f2fs_grab_page();
+ page = f2fs_compress_alloc_page();
if (!page)
goto out_free;
@@ -1313,7 +1426,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
continue;
}
- dic->tpages[i] = f2fs_grab_page();
+ dic->tpages[i] = f2fs_compress_alloc_page();
if (!dic->tpages[i])
goto out_free;
}
@@ -1335,8 +1448,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
continue;
if (!dic->tpages[i])
continue;
- unlock_page(dic->tpages[i]);
- put_page(dic->tpages[i]);
+ f2fs_compress_free_page(dic->tpages[i]);
}
kfree(dic->tpages);
}
@@ -1345,7 +1457,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
for (i = 0; i < dic->nr_cpages; i++) {
if (!dic->cpages[i])
continue;
- f2fs_put_compressed_page(dic->cpages[i]);
+ f2fs_compress_free_page(dic->cpages[i]);
}
kfree(dic->cpages);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 072fb19555a8..326c63879ddc 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -115,7 +115,8 @@ static enum count_type __read_io_type(struct page *page)
/* postprocessing steps for read bios */
enum bio_post_read_step {
STEP_DECRYPT,
- STEP_DECOMPRESS,
+ STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */
+ STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */
STEP_VERITY,
};
@@ -514,6 +515,34 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi,
__submit_bio(sbi, bio, type);
}
+static void __attach_io_flag(struct f2fs_io_info *fio)
+{
+ struct f2fs_sb_info *sbi = fio->sbi;
+ unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
+ unsigned int io_flag, fua_flag, meta_flag;
+
+ if (fio->type == DATA)
+ io_flag = sbi->data_io_flag;
+ else if (fio->type == NODE)
+ io_flag = sbi->node_io_flag;
+ else
+ return;
+
+ fua_flag = io_flag & temp_mask;
+ meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+
+ /*
+ * data/node io flag bits per temp:
+ * REQ_META | REQ_FUA |
+ * 5 | 4 | 3 | 2 | 1 | 0 |
+ * Cold | Warm | Hot | Cold | Warm | Hot |
+ */
+ if ((1 << fio->temp) & meta_flag)
+ fio->op_flags |= REQ_META;
+ if ((1 << fio->temp) & fua_flag)
+ fio->op_flags |= REQ_FUA;
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -521,6 +550,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
+ __attach_io_flag(fio);
bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
if (is_read_io(fio->op))
@@ -662,6 +692,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (fio->io_wbc && !is_read_io(fio->op))
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ __attach_io_flag(fio);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
@@ -848,6 +879,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_PAGES);
+ __attach_io_flag(fio);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
@@ -968,7 +1000,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
if (f2fs_encrypted_file(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (f2fs_compressed_file(inode))
- post_read_steps |= 1 << STEP_DECOMPRESS;
+ post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
if (f2fs_need_verity(inode, first_idx))
post_read_steps |= 1 << STEP_VERITY;
@@ -1011,6 +1043,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
}
ClearPageError(page);
inc_page_count(sbi, F2FS_RD_DATA);
+ f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
__submit_bio(sbi, bio, DATA);
return 0;
}
@@ -1809,6 +1842,25 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return (err < 0 ? err : 0);
}
+static loff_t max_inode_blocks(struct inode *inode)
+{
+ loff_t result = ADDRS_PER_INODE(inode);
+ loff_t leaf_count = ADDRS_PER_BLOCK(inode);
+
+ /* two direct node blocks */
+ result += (leaf_count * 2);
+
+ /* two indirect node blocks */
+ leaf_count *= NIDS_PER_BLOCK;
+ result += (leaf_count * 2);
+
+ /* one double indirect node block */
+ leaf_count *= NIDS_PER_BLOCK;
+ result += leaf_count;
+
+ return result;
+}
+
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -1818,6 +1870,8 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
+ bool compr_cluster = false;
+ unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
ret = f2fs_precache_extents(inode);
@@ -1852,6 +1906,9 @@ next:
memset(&map_bh, 0, sizeof(struct buffer_head));
map_bh.b_size = len;
+ if (compr_cluster)
+ map_bh.b_size = blk_to_logical(inode, cluster_size - 1);
+
ret = get_data_block(inode, start_blk, &map_bh, 0,
F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
if (ret)
@@ -1862,7 +1919,7 @@ next:
start_blk = next_pgofs;
if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
- F2FS_I_SB(inode)->max_file_blocks))
+ max_inode_blocks(inode)))
goto prep_next;
flags |= FIEMAP_EXTENT_LAST;
@@ -1874,11 +1931,38 @@ next:
ret = fiemap_fill_next_extent(fieinfo, logical,
phys, size, flags);
+ if (ret)
+ goto out;
+ size = 0;
}
- if (start_blk > last_blk || ret)
+ if (start_blk > last_blk)
goto out;
+ if (compr_cluster) {
+ compr_cluster = false;
+
+
+ logical = blk_to_logical(inode, start_blk - 1);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = blk_to_logical(inode, cluster_size);
+
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ start_blk += cluster_size - 1;
+
+ if (start_blk > last_blk)
+ goto out;
+
+ goto prep_next;
+ }
+
+ if (map_bh.b_blocknr == COMPRESS_ADDR) {
+ compr_cluster = true;
+ start_blk++;
+ goto prep_next;
+ }
+
logical = blk_to_logical(inode, start_blk);
phys = blk_to_logical(inode, map_bh.b_blocknr);
size = map_bh.b_size;
@@ -2016,6 +2100,7 @@ submit_and_realloc:
goto submit_and_realloc;
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE);
ClearPageError(page);
*last_block_in_bio = block_nr;
goto out;
@@ -2114,6 +2199,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page = dic->cpages[i];
block_t blkaddr;
+ struct bio_post_read_ctx *ctx;
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
@@ -2131,16 +2217,16 @@ submit_and_realloc:
page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- bio = NULL;
dic->failed = true;
if (refcount_sub_and_test(dic->nr_cpages - i,
- &dic->ref))
+ &dic->ref)) {
f2fs_decompress_end_io(dic->rpages,
cc->cluster_size, true,
false);
- f2fs_free_dic(dic);
+ f2fs_free_dic(dic);
+ }
f2fs_put_dnode(&dn);
- *bio_ret = bio;
+ *bio_ret = NULL;
return ret;
}
}
@@ -2150,7 +2236,14 @@ submit_and_realloc:
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
+ /* tag STEP_DECOMPRESS to handle IO in wq */
+ ctx = bio->bi_private;
+ if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS)))
+ ctx->enabled_steps |= 1 << STEP_DECOMPRESS;
+
inc_page_count(sbi, F2FS_RD_DATA);
+ f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE);
ClearPageError(page);
*last_block_in_bio = blkaddr;
}
@@ -2624,8 +2717,8 @@ write:
f2fs_available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
- /* Dentry blocks are controlled by checkpoint */
- if (S_ISDIR(inode->i_mode)) {
+ /* Dentry/quota blocks are controlled by checkpoint */
+ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) {
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
goto done;
@@ -2767,7 +2860,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
xa_mark_t tag;
int nwritten = 0;
@@ -2785,17 +2877,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2960,12 +3047,13 @@ next:
}
}
#endif
- if ((!cycled && !done) || retry) {
- cycled = 1;
+ if (retry) {
index = 0;
- end = writeback_index - 1;
+ end = -1;
goto retry;
}
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
@@ -3494,6 +3582,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
}
+ } else {
+ if (err > 0)
+ f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err);
}
out:
@@ -3577,6 +3668,37 @@ static int f2fs_set_data_page_dirty(struct page *page)
return 0;
}
+
+static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block)
+{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct dnode_of_data dn;
+ sector_t start_idx, blknr = 0;
+ int ret;
+
+ start_idx = round_down(block, F2FS_I(inode)->i_cluster_size);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+ if (ret)
+ return 0;
+
+ if (dn.data_blkaddr != COMPRESS_ADDR) {
+ dn.ofs_in_node += block - start_idx;
+ blknr = f2fs_data_blkaddr(&dn);
+ if (!__is_valid_data_blkaddr(blknr))
+ blknr = 0;
+ }
+
+ f2fs_put_dnode(&dn);
+
+ return blknr;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
@@ -3588,6 +3710,9 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
filemap_write_and_wait(mapping);
+ if (f2fs_compressed_file(inode))
+ return f2fs_bmap_compress(inode, block);
+
return generic_block_bmap(mapping, block, get_data_block_bmap);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 44bfc464df78..d35976785e8c 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -70,6 +70,111 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de)
return DT_UNKNOWN;
}
+/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */
+int f2fs_init_casefolded_name(const struct inode *dir,
+ struct f2fs_filename *fname)
+{
+#ifdef CONFIG_UNICODE
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+ if (IS_CASEFOLDED(dir)) {
+ fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN,
+ GFP_NOFS);
+ if (!fname->cf_name.name)
+ return -ENOMEM;
+ fname->cf_name.len = utf8_casefold(sbi->s_encoding,
+ fname->usr_fname,
+ fname->cf_name.name,
+ F2FS_NAME_LEN);
+ if ((int)fname->cf_name.len <= 0) {
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+ if (f2fs_has_strict_mode(sbi))
+ return -EINVAL;
+ /* fall back to treating name as opaque byte sequence */
+ }
+ }
+#endif
+ return 0;
+}
+
+static int __f2fs_setup_filename(const struct inode *dir,
+ const struct fscrypt_name *crypt_name,
+ struct f2fs_filename *fname)
+{
+ int err;
+
+ memset(fname, 0, sizeof(*fname));
+
+ fname->usr_fname = crypt_name->usr_fname;
+ fname->disk_name = crypt_name->disk_name;
+#ifdef CONFIG_FS_ENCRYPTION
+ fname->crypto_buf = crypt_name->crypto_buf;
+#endif
+ if (crypt_name->is_ciphertext_name) {
+ /* hash was decoded from the no-key name */
+ fname->hash = cpu_to_le32(crypt_name->hash);
+ } else {
+ err = f2fs_init_casefolded_name(dir, fname);
+ if (err) {
+ f2fs_free_filename(fname);
+ return err;
+ }
+ f2fs_hash_filename(dir, fname);
+ }
+ return 0;
+}
+
+/*
+ * Prepare to search for @iname in @dir. This is similar to
+ * fscrypt_setup_filename(), but this also handles computing the casefolded name
+ * and the f2fs dirhash if needed, then packing all the information about this
+ * filename up into a 'struct f2fs_filename'.
+ */
+int f2fs_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct f2fs_filename *fname)
+{
+ struct fscrypt_name crypt_name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name);
+ if (err)
+ return err;
+
+ return __f2fs_setup_filename(dir, &crypt_name, fname);
+}
+
+/*
+ * Prepare to look up @dentry in @dir. This is similar to
+ * fscrypt_prepare_lookup(), but this also handles computing the casefolded name
+ * and the f2fs dirhash if needed, then packing all the information about this
+ * filename up into a 'struct f2fs_filename'.
+ */
+int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct f2fs_filename *fname)
+{
+ struct fscrypt_name crypt_name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &crypt_name);
+ if (err)
+ return err;
+
+ return __f2fs_setup_filename(dir, &crypt_name, fname);
+}
+
+void f2fs_free_filename(struct f2fs_filename *fname)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+#endif
+#ifdef CONFIG_UNICODE
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+}
+
static unsigned long dir_block_index(unsigned int level,
int dir_level, unsigned int idx)
{
@@ -84,8 +189,7 @@ static unsigned long dir_block_index(unsigned int level,
static struct f2fs_dir_entry *find_in_block(struct inode *dir,
struct page *dentry_page,
- struct fscrypt_name *fname,
- f2fs_hash_t namehash,
+ const struct f2fs_filename *fname,
int *max_slots,
struct page **res_page)
{
@@ -96,7 +200,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
make_dentry_ptr_block(dir, &d, dentry_blk);
- de = f2fs_find_target_dentry(fname, namehash, max_slots, &d);
+ de = f2fs_find_target_dentry(&d, fname, max_slots);
if (de)
*res_page = dentry_page;
@@ -107,112 +211,57 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
- *
- * Returns: 0 if the directory entry matches, more than 0 if it
- * doesn't match or less than zero on error.
*/
-int f2fs_ci_compare(const struct inode *parent, const struct qstr *name,
- const struct qstr *entry, bool quick)
+static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
+ const u8 *de_name, u32 de_name_len)
{
- const struct f2fs_sb_info *sbi = F2FS_SB(parent->i_sb);
+ const struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
const struct unicode_map *um = sbi->s_encoding;
- int ret;
-
- if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
- else
- ret = utf8_strncasecmp(um, name, entry);
+ struct qstr entry = QSTR_INIT(de_name, de_name_len);
+ int res;
- if (ret < 0) {
- /* Handle invalid character sequence as either an error
- * or as an opaque byte sequence.
+ res = utf8_strncasecmp_folded(um, name, &entry);
+ if (res < 0) {
+ /*
+ * In strict mode, ignore invalid names. In non-strict mode,
+ * fall back to treating them as opaque byte sequences.
*/
- if (f2fs_has_strict_mode(sbi))
- return -EINVAL;
-
- if (name->len != entry->len)
- return 1;
-
- return !!memcmp(name->name, entry->name, name->len);
+ if (f2fs_has_strict_mode(sbi) || name->len != entry.len)
+ return false;
+ return !memcmp(name->name, entry.name, name->len);
}
-
- return ret;
+ return res == 0;
}
+#endif /* CONFIG_UNICODE */
-static void f2fs_fname_setup_ci_filename(struct inode *dir,
- const struct qstr *iname,
- struct fscrypt_str *cf_name)
+static inline bool f2fs_match_name(const struct inode *dir,
+ const struct f2fs_filename *fname,
+ const u8 *de_name, u32 de_name_len)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
-
- if (!IS_CASEFOLDED(dir)) {
- cf_name->name = NULL;
- return;
- }
+ struct fscrypt_name f;
- cf_name->name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS);
- if (!cf_name->name)
- return;
-
- cf_name->len = utf8_casefold(sbi->s_encoding,
- iname, cf_name->name,
- F2FS_NAME_LEN);
- if ((int)cf_name->len <= 0) {
- kvfree(cf_name->name);
- cf_name->name = NULL;
- }
-}
-#endif
-
-static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d,
- struct f2fs_dir_entry *de,
- struct fscrypt_name *fname,
- struct fscrypt_str *cf_str,
- unsigned long bit_pos,
- f2fs_hash_t namehash)
-{
#ifdef CONFIG_UNICODE
- struct inode *parent = d->inode;
- struct f2fs_sb_info *sbi = F2FS_I_SB(parent);
- struct qstr entry;
-#endif
-
- if (de->hash_code != namehash)
- return false;
+ if (fname->cf_name.name) {
+ struct qstr cf = FSTR_TO_QSTR(&fname->cf_name);
-#ifdef CONFIG_UNICODE
- entry.name = d->filename[bit_pos];
- entry.len = de->name_len;
-
- if (sbi->s_encoding && IS_CASEFOLDED(parent)) {
- if (cf_str->name) {
- struct qstr cf = {.name = cf_str->name,
- .len = cf_str->len};
- return !f2fs_ci_compare(parent, &cf, &entry, true);
- }
- return !f2fs_ci_compare(parent, fname->usr_fname, &entry,
- false);
+ return f2fs_match_ci_name(dir, &cf, de_name, de_name_len);
}
#endif
- if (fscrypt_match_name(fname, d->filename[bit_pos],
- le16_to_cpu(de->name_len)))
- return true;
- return false;
+ f.usr_fname = fname->usr_fname;
+ f.disk_name = fname->disk_name;
+#ifdef CONFIG_FS_ENCRYPTION
+ f.crypto_buf = fname->crypto_buf;
+#endif
+ return fscrypt_match_name(&f, de_name, de_name_len);
}
-struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
- f2fs_hash_t namehash, int *max_slots,
- struct f2fs_dentry_ptr *d)
+struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
+ const struct f2fs_filename *fname, int *max_slots)
{
struct f2fs_dir_entry *de;
- struct fscrypt_str cf_str = { .name = NULL, .len = 0 };
unsigned long bit_pos = 0;
int max_len = 0;
-#ifdef CONFIG_UNICODE
- f2fs_fname_setup_ci_filename(d->inode, fname->usr_fname, &cf_str);
-#endif
-
if (max_slots)
*max_slots = 0;
while (bit_pos < d->max) {
@@ -229,7 +278,9 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
continue;
}
- if (f2fs_match_name(d, de, fname, &cf_str, bit_pos, namehash))
+ if (de->hash_code == fname->hash &&
+ f2fs_match_name(d->inode, fname, d->filename[bit_pos],
+ le16_to_cpu(de->name_len)))
goto found;
if (max_slots && max_len > *max_slots)
@@ -243,33 +294,27 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
found:
if (max_slots && max_len > *max_slots)
*max_slots = max_len;
-
-#ifdef CONFIG_UNICODE
- kvfree(cf_str.name);
-#endif
return de;
}
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int level,
- struct fscrypt_name *fname,
+ const struct f2fs_filename *fname,
struct page **res_page)
{
- struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
- int s = GET_DENTRY_SLOTS(name.len);
+ int s = GET_DENTRY_SLOTS(fname->disk_name.len);
unsigned int nbucket, nblock;
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
bool room = false;
int max_slots;
- f2fs_hash_t namehash = f2fs_dentry_hash(dir, &name, fname);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
- le32_to_cpu(namehash) % nbucket);
+ le32_to_cpu(fname->hash) % nbucket);
end_block = bidx + nblock;
for (; bidx < end_block; bidx++) {
@@ -285,8 +330,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
}
}
- de = find_in_block(dir, dentry_page, fname, namehash,
- &max_slots, res_page);
+ de = find_in_block(dir, dentry_page, fname, &max_slots,
+ res_page);
if (de)
break;
@@ -295,8 +340,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
f2fs_put_page(dentry_page, 0);
}
- if (!de && room && F2FS_I(dir)->chash != namehash) {
- F2FS_I(dir)->chash = namehash;
+ if (!de && room && F2FS_I(dir)->chash != fname->hash) {
+ F2FS_I(dir)->chash = fname->hash;
F2FS_I(dir)->clevel = level;
}
@@ -304,7 +349,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
}
struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
- struct fscrypt_name *fname, struct page **res_page)
+ const struct f2fs_filename *fname,
+ struct page **res_page)
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
@@ -353,18 +399,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
const struct qstr *child, struct page **res_page)
{
struct f2fs_dir_entry *de = NULL;
- struct fscrypt_name fname;
+ struct f2fs_filename fname;
int err;
-#ifdef CONFIG_UNICODE
- if (f2fs_has_strict_mode(F2FS_I_SB(dir)) && IS_CASEFOLDED(dir) &&
- utf8_validate(F2FS_I_SB(dir)->s_encoding, child)) {
- *res_page = ERR_PTR(-EINVAL);
- return NULL;
- }
-#endif
-
- err = fscrypt_setup_filename(dir, child, 1, &fname);
+ err = f2fs_setup_filename(dir, child, 1, &fname);
if (err) {
if (err == -ENOENT)
*res_page = NULL;
@@ -375,7 +413,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
de = __f2fs_find_entry(dir, &fname, res_page);
- fscrypt_free_filename(&fname);
+ f2fs_free_filename(&fname);
return de;
}
@@ -416,7 +454,8 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_put_page(page, 1);
}
-static void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct f2fs_filename *fname,
+ struct page *ipage)
{
struct f2fs_inode *ri;
@@ -424,16 +463,16 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
- ri->i_namelen = cpu_to_le32(name->len);
- memcpy(ri->i_name, name->name, name->len);
+ ri->i_namelen = cpu_to_le32(fname->disk_name.len);
+ memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len);
set_page_dirty(ipage);
}
void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
- struct qstr dot = QSTR_INIT(".", 1);
- struct qstr dotdot = QSTR_INIT("..", 2);
+ struct fscrypt_str dot = FSTR_INIT(".", 1);
+ struct fscrypt_str dotdot = FSTR_INIT("..", 2);
/* update dirent of "." */
f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
@@ -467,8 +506,7 @@ static int make_empty_dir(struct inode *inode,
}
struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
- const struct qstr *new_name, const struct qstr *orig_name,
- struct page *dpage)
+ const struct f2fs_filename *fname, struct page *dpage)
{
struct page *page;
int err;
@@ -493,7 +531,8 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- err = f2fs_init_security(inode, dir, orig_name, page);
+ err = f2fs_init_security(inode, dir,
+ fname ? fname->usr_fname : NULL, page);
if (err)
goto put_error;
@@ -508,8 +547,8 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
return page;
}
- if (new_name) {
- init_dent_inode(new_name, page);
+ if (fname) {
+ init_dent_inode(fname, page);
if (IS_ENCRYPTED(dir))
file_set_enc_name(inode);
}
@@ -577,11 +616,11 @@ next:
}
bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
- struct fscrypt_name *fname)
+ const struct f2fs_filename *fname)
{
struct f2fs_dentry_ptr d;
unsigned int bit_pos;
- int slots = GET_DENTRY_SLOTS(fname_len(fname));
+ int slots = GET_DENTRY_SLOTS(fname->disk_name.len);
make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage));
@@ -591,8 +630,8 @@ bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
}
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
- const struct qstr *name, f2fs_hash_t name_hash,
- unsigned int bit_pos)
+ const struct fscrypt_str *name, f2fs_hash_t name_hash,
+ unsigned int bit_pos)
{
struct f2fs_dir_entry *de;
int slots = GET_DENTRY_SLOTS(name->len);
@@ -612,15 +651,13 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
}
}
-int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
- const struct qstr *orig_name,
- struct inode *inode, nid_t ino, umode_t mode)
+int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname,
+ struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
unsigned int level;
unsigned int current_depth;
unsigned long bidx, block;
- f2fs_hash_t dentry_hash;
unsigned int nbucket, nblock;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
@@ -629,11 +666,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
int slots, err = 0;
level = 0;
- slots = GET_DENTRY_SLOTS(new_name->len);
- dentry_hash = f2fs_dentry_hash(dir, new_name, NULL);
+ slots = GET_DENTRY_SLOTS(fname->disk_name.len);
current_depth = F2FS_I(dir)->i_current_depth;
- if (F2FS_I(dir)->chash == dentry_hash) {
+ if (F2FS_I(dir)->chash == fname->hash) {
level = F2FS_I(dir)->clevel;
F2FS_I(dir)->chash = 0;
}
@@ -655,7 +691,7 @@ start:
nblock = bucket_blocks(level);
bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
- (le32_to_cpu(dentry_hash) % nbucket));
+ (le32_to_cpu(fname->hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = f2fs_get_new_data_page(dir, NULL, block, true);
@@ -679,8 +715,7 @@ add_dentry:
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, new_name,
- orig_name, NULL);
+ page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -688,7 +723,8 @@ add_dentry:
}
make_dentry_ptr_block(NULL, &d, dentry_blk);
- f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
+ f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash,
+ bit_pos);
set_page_dirty(dentry_page);
@@ -712,21 +748,15 @@ fail:
return err;
}
-int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname,
- struct inode *inode, nid_t ino, umode_t mode)
+int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname,
+ struct inode *inode, nid_t ino, umode_t mode)
{
- struct qstr new_name;
int err = -EAGAIN;
- new_name.name = fname_name(fname);
- new_name.len = fname_len(fname);
-
if (f2fs_has_inline_dentry(dir))
- err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname,
- inode, ino, mode);
+ err = f2fs_add_inline_entry(dir, fname, inode, ino, mode);
if (err == -EAGAIN)
- err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname,
- inode, ino, mode);
+ err = f2fs_add_regular_entry(dir, fname, inode, ino, mode);
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
@@ -739,12 +769,12 @@ int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname,
int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
- struct fscrypt_name fname;
+ struct f2fs_filename fname;
struct page *page = NULL;
struct f2fs_dir_entry *de = NULL;
int err;
- err = fscrypt_setup_filename(dir, name, 0, &fname);
+ err = f2fs_setup_filename(dir, name, 0, &fname);
if (err)
return err;
@@ -767,7 +797,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
} else {
err = f2fs_add_dentry(dir, &fname, inode, ino, mode);
}
- fscrypt_free_filename(&fname);
+ f2fs_free_filename(&fname);
return err;
}
@@ -777,7 +807,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL);
+ page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -1080,17 +1110,41 @@ const struct file_operations f2fs_dir_operations = {
static int f2fs_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
- struct qstr qstr = {.name = str, .len = len };
const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = READ_ONCE(parent->d_inode);
+ const struct inode *dir = READ_ONCE(parent->d_inode);
+ const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct qstr entry = QSTR_INIT(str, len);
+ char strbuf[DNAME_INLINE_LEN];
+ int res;
+
+ if (!dir || !IS_CASEFOLDED(dir))
+ goto fallback;
- if (!inode || !IS_CASEFOLDED(inode)) {
- if (len != name->len)
- return -1;
- return memcmp(str, name->name, len);
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ entry.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
}
- return f2fs_ci_compare(inode, name, &qstr, false);
+ res = utf8_strncasecmp(sbi->s_encoding, name, &entry);
+ if (res >= 0)
+ return res;
+
+ if (f2fs_has_strict_mode(sbi))
+ return -EINVAL;
+fallback:
+ if (len != name->len)
+ return 1;
+ return !!memcmp(str, name->name, len);
}
static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5c0149d2f46a..b35a50f4953c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/f2fs.h
*
@@ -139,6 +139,7 @@ struct f2fs_mount_info {
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */
+ block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
*/
@@ -194,6 +195,7 @@ enum {
#define CP_DISCARD 0x00000010
#define CP_TRIMMED 0x00000020
#define CP_PAUSE 0x00000040
+#define CP_RESIZE 0x00000080
#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
@@ -428,6 +430,10 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
+#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \
+ _IOR(F2FS_IOCTL_MAGIC, 18, __u64)
+#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
+ _IOR(F2FS_IOCTL_MAGIC, 19, __u64)
#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
@@ -506,6 +512,42 @@ static inline int get_inline_xattr_addrs(struct inode *inode);
* For INODE and NODE manager
*/
/* for directory operations */
+
+struct f2fs_filename {
+ /*
+ * The filename the user specified. This is NULL for some
+ * filesystem-internal operations, e.g. converting an inline directory
+ * to a non-inline one, or roll-forward recovering an encrypted dentry.
+ */
+ const struct qstr *usr_fname;
+
+ /*
+ * The on-disk filename. For encrypted directories, this is encrypted.
+ * This may be NULL for lookups in an encrypted dir without the key.
+ */
+ struct fscrypt_str disk_name;
+
+ /* The dirhash of this filename */
+ f2fs_hash_t hash;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ /*
+ * For lookups in encrypted directories: either the buffer backing
+ * disk_name, or a buffer that holds the decoded no-key name.
+ */
+ struct fscrypt_str crypto_buf;
+#endif
+#ifdef CONFIG_UNICODE
+ /*
+ * For casefolded directories: the casefolded name, but it's left NULL
+ * if the original name is not valid Unicode or if the filesystem is
+ * doing an internal operation where usr_fname is also NULL. In these
+ * cases we fall back to treating the name as an opaque byte sequence.
+ */
+ struct fscrypt_str cf_name;
+#endif
+};
+
struct f2fs_dentry_ptr {
struct inode *inode;
void *bitmap;
@@ -1088,8 +1130,9 @@ enum cp_reason_type {
};
enum iostat_type {
- APP_DIRECT_IO, /* app direct IOs */
- APP_BUFFERED_IO, /* app buffered IOs */
+ /* WRITE IO */
+ APP_DIRECT_IO, /* app direct write IOs */
+ APP_BUFFERED_IO, /* app buffered write IOs */
APP_WRITE_IO, /* app write IOs */
APP_MAPPED_IO, /* app mapped IOs */
FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */
@@ -1100,6 +1143,19 @@ enum iostat_type {
FS_CP_DATA_IO, /* data IOs from checkpoint */
FS_CP_NODE_IO, /* node IOs from checkpoint */
FS_CP_META_IO, /* meta IOs from checkpoint */
+
+ /* READ IO */
+ APP_DIRECT_READ_IO, /* app direct read IOs */
+ APP_BUFFERED_READ_IO, /* app buffered read IOs */
+ APP_READ_IO, /* app read IOs */
+ APP_MAPPED_READ_IO, /* app mapped read IOs */
+ FS_DATA_READ_IO, /* data read IOs */
+ FS_GDATA_READ_IO, /* data read IOs from background gc */
+ FS_CDATA_READ_IO, /* compressed data read IOs */
+ FS_NODE_READ_IO, /* node read IOs */
+ FS_META_READ_IO, /* meta read IOs */
+
+ /* other */
FS_DISCARD, /* discard */
NR_IO_TYPE,
};
@@ -1269,6 +1325,7 @@ enum compress_algorithm_type {
COMPRESS_LZO,
COMPRESS_LZ4,
COMPRESS_ZSTD,
+ COMPRESS_LZORLE,
COMPRESS_MAX,
};
@@ -1418,7 +1475,6 @@ struct f2fs_sb_info {
unsigned int segs_per_sec; /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections; /* total section count */
- struct mutex resize_mutex; /* for resize exclusion */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
loff_t max_file_blocks; /* max block index of file */
@@ -1504,8 +1560,15 @@ struct f2fs_sb_info {
/* For app/fs IO statistics */
spinlock_t iostat_lock;
- unsigned long long write_iostat[NR_IO_TYPE];
+ unsigned long long rw_iostat[NR_IO_TYPE];
+ unsigned long long prev_rw_iostat[NR_IO_TYPE];
bool iostat_enable;
+ unsigned long iostat_next_period;
+ unsigned int iostat_period_ms;
+
+ /* to attach REQ_META|REQ_FUA flags */
+ unsigned int data_io_flag;
+ unsigned int node_io_flag;
/* For sysfs suppport */
struct kobject s_kobj;
@@ -2902,12 +2965,12 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
}
-static inline bool is_dot_dotdot(const struct qstr *str)
+static inline bool is_dot_dotdot(const u8 *name, size_t len)
{
- if (str->len == 1 && str->name[0] == '.')
+ if (len == 1 && name[0] == '.')
return true;
- if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ if (len == 2 && name[0] == '.' && name[1] == '.')
return true;
return false;
@@ -2935,18 +2998,12 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
- void *ret;
-
if (time_to_inject(sbi, FAULT_KMALLOC)) {
f2fs_show_injection_info(sbi, FAULT_KMALLOC);
return NULL;
}
- ret = kmalloc(size, flags);
- if (ret)
- return ret;
-
- return kvmalloc(size, flags);
+ return kmalloc(size, flags);
}
static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
@@ -2996,29 +3053,45 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
sizeof((f2fs_inode)->field)) \
<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
+#define DEFAULT_IOSTAT_PERIOD_MS 3000
+#define MIN_IOSTAT_PERIOD_MS 100
+/* maximum period of iostat tracing is 1 day */
+#define MAX_IOSTAT_PERIOD_MS 8640000
+
static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
int i;
spin_lock(&sbi->iostat_lock);
- for (i = 0; i < NR_IO_TYPE; i++)
- sbi->write_iostat[i] = 0;
+ for (i = 0; i < NR_IO_TYPE; i++) {
+ sbi->rw_iostat[i] = 0;
+ sbi->prev_rw_iostat[i] = 0;
+ }
spin_unlock(&sbi->iostat_lock);
}
+extern void f2fs_record_iostat(struct f2fs_sb_info *sbi);
+
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
if (!sbi->iostat_enable)
return;
spin_lock(&sbi->iostat_lock);
- sbi->write_iostat[type] += io_bytes;
+ sbi->rw_iostat[type] += io_bytes;
if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
- sbi->write_iostat[APP_BUFFERED_IO] =
- sbi->write_iostat[APP_WRITE_IO] -
- sbi->write_iostat[APP_DIRECT_IO];
+ sbi->rw_iostat[APP_BUFFERED_IO] =
+ sbi->rw_iostat[APP_WRITE_IO] -
+ sbi->rw_iostat[APP_DIRECT_IO];
+
+ if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
+ sbi->rw_iostat[APP_BUFFERED_READ_IO] =
+ sbi->rw_iostat[APP_READ_IO] -
+ sbi->rw_iostat[APP_DIRECT_READ_IO];
spin_unlock(&sbi->iostat_lock);
+
+ f2fs_record_iostat(sbi);
}
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
@@ -3064,6 +3137,7 @@ static inline void f2fs_clear_page_private(struct page *page)
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
+int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
@@ -3099,31 +3173,32 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set);
struct dentry *f2fs_get_parent(struct dentry *child);
-extern int f2fs_ci_compare(const struct inode *parent,
- const struct qstr *name,
- const struct qstr *entry,
- bool quick);
-
/*
* dir.c
*/
unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de);
-struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
- f2fs_hash_t namehash, int *max_slots,
- struct f2fs_dentry_ptr *d);
+int f2fs_init_casefolded_name(const struct inode *dir,
+ struct f2fs_filename *fname);
+int f2fs_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct f2fs_filename *fname);
+int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct f2fs_filename *fname);
+void f2fs_free_filename(struct f2fs_filename *fname);
+struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
+ const struct f2fs_filename *fname, int *max_slots);
int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
unsigned int start_pos, struct fscrypt_str *fstr);
void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d);
struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
- const struct qstr *new_name,
- const struct qstr *orig_name, struct page *dpage);
+ const struct f2fs_filename *fname, struct page *dpage);
void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth);
int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots);
void f2fs_drop_nlink(struct inode *dir, struct inode *inode);
struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
- struct fscrypt_name *fname, struct page **res_page);
+ const struct f2fs_filename *fname,
+ struct page **res_page);
struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
const struct qstr *child, struct page **res_page);
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p);
@@ -3132,14 +3207,13 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode);
bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
- struct fscrypt_name *fname);
+ const struct f2fs_filename *fname);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
- const struct qstr *name, f2fs_hash_t name_hash,
+ const struct fscrypt_str *name, f2fs_hash_t name_hash,
unsigned int bit_pos);
-int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
- const struct qstr *orig_name,
+int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode);
-int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname,
+int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode);
int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode);
@@ -3169,8 +3243,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
*/
-f2fs_hash_t f2fs_dentry_hash(const struct inode *dir,
- const struct qstr *name_info, struct fscrypt_name *fname);
+void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname);
/*
* node.c
@@ -3202,6 +3275,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
struct page *f2fs_get_node_page_ra(struct page *parent, int start);
int f2fs_move_node_page(struct page *node_page, int gc_type);
+int f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id);
@@ -3645,7 +3719,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
-static inline void update_sit_info(struct f2fs_sb_info *sbi) {}
+static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {}
#endif
extern const struct file_operations f2fs_dir_operations;
@@ -3678,11 +3752,11 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
int f2fs_write_inline_data(struct inode *inode, struct page *page);
bool f2fs_recover_inline_data(struct inode *inode, struct page *npage);
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
- struct fscrypt_name *fname, struct page **res_page);
+ const struct f2fs_filename *fname,
+ struct page **res_page);
int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
struct page *ipage);
-int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
- const struct qstr *orig_name,
+int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode);
void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry,
struct page *page, struct inode *dir,
@@ -3781,8 +3855,11 @@ int f2fs_prepare_compress_overwrite(struct inode *inode,
struct page **pagep, pgoff_t index, void **fsdata);
bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
pgoff_t index, unsigned copied);
+int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock);
void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
+int f2fs_init_compress_mempool(void);
+void f2fs_destroy_compress_mempool(void);
void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
@@ -3816,6 +3893,8 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
}
+static inline int f2fs_init_compress_mempool(void) { return 0; }
+static inline void f2fs_destroy_compress_mempool(void) { }
#endif
static inline void set_compress_context(struct inode *inode)
@@ -3962,6 +4041,10 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
{
int diff = F2FS_I(inode)->i_cluster_size - blocks;
+ /* don't update i_compr_blocks if saved blocks were released */
+ if (!add && !F2FS_I(inode)->i_compr_blocks)
+ return;
+
if (add) {
F2FS_I(inode)->i_compr_blocks += diff;
stat_add_compr_blocks(inode, diff);
@@ -4003,8 +4086,6 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
return true;
if (f2fs_is_multi_device(sbi))
return true;
- if (f2fs_compressed_file(inode))
- return true;
/*
* for blkzoned device, fallback direct IO to buffered IO, so
* all IOs can be serialized by log-structured write.
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6ab8f621a3c5..3268f8dd59bb 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -40,6 +40,10 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
ret = filemap_fault(vmf);
up_read(&F2FS_I(inode)->i_mmap_sem);
+ if (!ret)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
+ F2FS_BLKSIZE);
+
trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
return ret;
@@ -165,9 +169,11 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
{
struct dentry *dentry;
- inode = igrab(inode);
- dentry = d_find_any_alias(inode);
- iput(inode);
+ /*
+ * Make sure to get the non-deleted alias. The alias associated with
+ * the open file descriptor being fsync()'ed may be deleted already.
+ */
+ dentry = d_find_alias(inode);
if (!dentry)
return 0;
@@ -557,6 +563,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
bool compressed_cluster = false;
int cluster_index = 0, valid_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ bool released = !F2FS_I(dn->inode)->i_compr_blocks;
if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
base = get_extra_isize(dn->inode);
@@ -595,7 +602,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
f2fs_invalidate_blocks(sbi, blkaddr);
- nr_free++;
+
+ if (!released || blkaddr != COMPRESS_ADDR)
+ nr_free++;
}
if (compressed_cluster)
@@ -643,9 +652,6 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
}
- if (f2fs_compressed_file(inode))
- return 0;
-
page = f2fs_get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
@@ -661,7 +667,7 @@ truncate_out:
return 0;
}
-static int do_truncate_blocks(struct inode *inode, u64 from, bool lock)
+int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -729,23 +735,28 @@ free_partial:
int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
u64 free_from = from;
+ int err;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
/*
* for compressed file, only support cluster size
* aligned truncation.
*/
- if (f2fs_compressed_file(inode)) {
- size_t cluster_shift = PAGE_SHIFT +
- F2FS_I(inode)->i_log_cluster_size;
- size_t cluster_mask = (1 << cluster_shift) - 1;
+ if (f2fs_compressed_file(inode))
+ free_from = round_up(from,
+ F2FS_I(inode)->i_cluster_size << PAGE_SHIFT);
+#endif
- free_from = from >> cluster_shift;
- if (from & cluster_mask)
- free_from++;
- free_from <<= cluster_shift;
- }
+ err = f2fs_do_truncate_blocks(inode, free_from, lock);
+ if (err)
+ return err;
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (from != free_from)
+ err = f2fs_truncate_partial_cluster(inode, from, lock);
+#endif
- return do_truncate_blocks(inode, free_from, lock);
+ return err;
}
int f2fs_truncate(struct inode *inode)
@@ -968,9 +979,7 @@ const struct inode_operations f2fs_file_inode_operations = {
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
-#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
-#endif
.fiemap = f2fs_fiemap,
};
@@ -1649,7 +1658,11 @@ next_alloc:
down_write(&sbi->pin_sem);
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
+
+ f2fs_lock_op(sbi);
f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+ f2fs_unlock_op(sbi);
+
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
up_write(&sbi->pin_sem);
@@ -2219,8 +2232,15 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (in != F2FS_GOING_DOWN_FULLSYNC) {
ret = mnt_want_write_file(filp);
- if (ret)
+ if (ret) {
+ if (ret == -EROFS) {
+ ret = 0;
+ f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ trace_f2fs_shutdown(sbi, in, ret);
+ }
return ret;
+ }
}
switch (in) {
@@ -3301,7 +3321,6 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
__u64 block_count;
- int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3313,9 +3332,7 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg)
sizeof(block_count)))
return -EFAULT;
- ret = f2fs_resize_fs(sbi, block_count);
-
- return ret;
+ return f2fs_resize_fs(sbi, block_count);
}
static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg)
@@ -3419,6 +3436,326 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
return put_user(blocks, (u64 __user *)arg);
}
+static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ unsigned int released_blocks = 0;
+ int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ block_t blkaddr;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (!__is_valid_data_blkaddr(blkaddr))
+ continue;
+ if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE)))
+ return -EFSCORRUPTED;
+ }
+
+ while (count) {
+ int compr_blocks = 0;
+
+ for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+ blkaddr = f2fs_data_blkaddr(dn);
+
+ if (i == 0) {
+ if (blkaddr == COMPRESS_ADDR)
+ continue;
+ dn->ofs_in_node += cluster_size;
+ goto next;
+ }
+
+ if (__is_valid_data_blkaddr(blkaddr))
+ compr_blocks++;
+
+ if (blkaddr != NEW_ADDR)
+ continue;
+
+ dn->data_blkaddr = NULL_ADDR;
+ f2fs_set_data_blkaddr(dn);
+ }
+
+ f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false);
+ dec_valid_block_count(sbi, dn->inode,
+ cluster_size - compr_blocks);
+
+ released_blocks += cluster_size - compr_blocks;
+next:
+ count -= cluster_size;
+ }
+
+ return released_blocks;
+}
+
+static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t page_idx = 0, last_idx;
+ unsigned int released_blocks = 0;
+ int ret;
+ int writecount;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
+ inode_lock(inode);
+
+ writecount = atomic_read(&inode->i_writecount);
+ if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (IS_IMMUTABLE(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret)
+ goto out;
+
+ if (!F2FS_I(inode)->i_compr_blocks)
+ goto out;
+
+ F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL;
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = current_time(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
+ last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+
+ while (page_idx < last_idx) {
+ struct dnode_of_data dn;
+ pgoff_t end_offset, count;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
+ if (ret) {
+ if (ret == -ENOENT) {
+ page_idx = f2fs_get_next_page_offset(&dn,
+ page_idx);
+ ret = 0;
+ continue;
+ }
+ break;
+ }
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
+ count = round_up(count, F2FS_I(inode)->i_cluster_size);
+
+ ret = release_compress_blocks(&dn, count);
+
+ f2fs_put_dnode(&dn);
+
+ if (ret < 0)
+ break;
+
+ page_idx += count;
+ released_blocks += ret;
+ }
+
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+out:
+ inode_unlock(inode);
+
+ mnt_drop_write_file(filp);
+
+ if (ret >= 0) {
+ ret = put_user(released_blocks, (u64 __user *)arg);
+ } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
+ "iblocks=%llu, released=%u, compr_blocks=%llu, "
+ "run fsck to fix.",
+ __func__, inode->i_ino, inode->i_blocks,
+ released_blocks,
+ F2FS_I(inode)->i_compr_blocks);
+ }
+
+ return ret;
+}
+
+static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ unsigned int reserved_blocks = 0;
+ int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ block_t blkaddr;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (!__is_valid_data_blkaddr(blkaddr))
+ continue;
+ if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE)))
+ return -EFSCORRUPTED;
+ }
+
+ while (count) {
+ int compr_blocks = 0;
+ blkcnt_t reserved;
+ int ret;
+
+ for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+ blkaddr = f2fs_data_blkaddr(dn);
+
+ if (i == 0) {
+ if (blkaddr == COMPRESS_ADDR)
+ continue;
+ dn->ofs_in_node += cluster_size;
+ goto next;
+ }
+
+ if (__is_valid_data_blkaddr(blkaddr)) {
+ compr_blocks++;
+ continue;
+ }
+
+ dn->data_blkaddr = NEW_ADDR;
+ f2fs_set_data_blkaddr(dn);
+ }
+
+ reserved = cluster_size - compr_blocks;
+ ret = inc_valid_block_count(sbi, dn->inode, &reserved);
+ if (ret)
+ return ret;
+
+ if (reserved != cluster_size - compr_blocks)
+ return -ENOSPC;
+
+ f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
+
+ reserved_blocks += reserved;
+next:
+ count -= cluster_size;
+ }
+
+ return reserved_blocks;
+}
+
+static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t page_idx = 0, last_idx;
+ unsigned int reserved_blocks = 0;
+ int ret;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (F2FS_I(inode)->i_compr_blocks)
+ goto out;
+
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
+ inode_lock(inode);
+
+ if (!IS_IMMUTABLE(inode)) {
+ ret = -EINVAL;
+ goto unlock_inode;
+ }
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
+ last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+
+ while (page_idx < last_idx) {
+ struct dnode_of_data dn;
+ pgoff_t end_offset, count;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
+ if (ret) {
+ if (ret == -ENOENT) {
+ page_idx = f2fs_get_next_page_offset(&dn,
+ page_idx);
+ ret = 0;
+ continue;
+ }
+ break;
+ }
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
+ count = round_up(count, F2FS_I(inode)->i_cluster_size);
+
+ ret = reserve_compress_blocks(&dn, count);
+
+ f2fs_put_dnode(&dn);
+
+ if (ret < 0)
+ break;
+
+ page_idx += count;
+ reserved_blocks += ret;
+ }
+
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+
+ if (ret >= 0) {
+ F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL;
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = current_time(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
+unlock_inode:
+ inode_unlock(inode);
+out:
+ mnt_drop_write_file(filp);
+
+ if (ret >= 0) {
+ ret = put_user(reserved_blocks, (u64 __user *)arg);
+ } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
+ "iblocks=%llu, reserved=%u, compr_blocks=%llu, "
+ "run fsck to fix.",
+ __func__, inode->i_ino, inode->i_blocks,
+ reserved_blocks,
+ F2FS_I(inode)->i_compr_blocks);
+ }
+
+ return ret;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3501,6 +3838,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_set_volume_name(filp, arg);
case F2FS_IOC_GET_COMPRESS_BLOCKS:
return f2fs_get_compress_blocks(filp, arg);
+ case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
+ return f2fs_release_compress_blocks(filp, arg);
+ case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
+ return f2fs_reserve_compress_blocks(filp, arg);
default:
return -ENOTTY;
}
@@ -3510,11 +3851,17 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ int ret;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- return generic_file_read_iter(iocb, iter);
+ ret = generic_file_read_iter(iocb, iter);
+
+ if (ret > 0)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret);
+
+ return ret;
}
static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -3662,6 +4009,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GET_VOLUME_NAME:
case F2FS_IOC_SET_VOLUME_NAME:
case F2FS_IOC_GET_COMPRESS_BLOCKS:
+ case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
+ case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 26248c8936db..5b95d5a146eb 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -13,6 +13,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
+#include <linux/sched/signal.h>
#include "f2fs.h"
#include "node.h"
@@ -737,6 +738,10 @@ got_it:
goto put_encrypted_page;
f2fs_put_page(fio.encrypted_page, 0);
f2fs_put_page(page, 1);
+
+ f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+
return 0;
put_encrypted_page:
f2fs_put_page(fio.encrypted_page, 1);
@@ -840,6 +845,10 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_put_page(mpage, 1);
goto up_out;
}
+
+ f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+
lock_page(mpage);
if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
!PageUptodate(mpage))) {
@@ -1399,12 +1408,29 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
-static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
- unsigned int end)
+static int free_segment_range(struct f2fs_sb_info *sbi,
+ unsigned int secs, bool gc_only)
{
- int type;
- unsigned int segno, next_inuse;
+ unsigned int segno, next_inuse, start, end;
+ struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
+ int gc_mode, gc_type;
int err = 0;
+ int type;
+
+ /* Force block allocation for GC */
+ MAIN_SECS(sbi) -= secs;
+ start = MAIN_SECS(sbi) * sbi->segs_per_sec;
+ end = MAIN_SEGS(sbi) - 1;
+
+ mutex_lock(&DIRTY_I(sbi)->seglist_lock);
+ for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
+ if (SIT_I(sbi)->last_victim[gc_mode] >= start)
+ SIT_I(sbi)->last_victim[gc_mode] = 0;
+
+ for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
+ if (sbi->next_victim_seg[gc_type] >= start)
+ sbi->next_victim_seg[gc_type] = NULL_SEGNO;
+ mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
/* Move out cursegs from the target range */
for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++)
@@ -1417,18 +1443,24 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- down_write(&sbi->gc_lock);
do_garbage_collect(sbi, segno, &gc_list, FG_GC);
- up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
- if (get_valid_blocks(sbi, segno, true))
- return -EAGAIN;
+ if (!gc_only && get_valid_blocks(sbi, segno, true)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
}
+ if (gc_only)
+ goto out;
- err = f2fs_sync_fs(sbi->sb, 1);
+ err = f2fs_write_checkpoint(sbi, &cpc);
if (err)
- return err;
+ goto out;
next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
if (next_inuse <= end) {
@@ -1436,6 +1468,8 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
next_inuse);
f2fs_bug_on(sbi, 1);
}
+out:
+ MAIN_SECS(sbi) += secs;
return err;
}
@@ -1481,6 +1515,7 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
+ MAIN_SECS(sbi) += secs;
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
@@ -1502,8 +1537,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
{
__u64 old_block_count, shrunk_blocks;
+ struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
unsigned int secs;
- int gc_mode, gc_type;
int err = 0;
__u32 rem;
@@ -1538,10 +1573,27 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
return -EINVAL;
}
- freeze_bdev(sbi->sb->s_bdev);
-
shrunk_blocks = old_block_count - block_count;
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
+
+ /* stop other GC */
+ if (!down_write_trylock(&sbi->gc_lock))
+ return -EAGAIN;
+
+ /* stop CP to protect MAIN_SEC in free_segment_range */
+ f2fs_lock_op(sbi);
+ err = free_segment_range(sbi, secs, true);
+ f2fs_unlock_op(sbi);
+ up_write(&sbi->gc_lock);
+ if (err)
+ return err;
+
+ set_sbi_flag(sbi, SBI_IS_RESIZEFS);
+
+ freeze_super(sbi->sb);
+ down_write(&sbi->gc_lock);
+ mutex_lock(&sbi->cp_mutex);
+
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
sbi->current_reserved_blocks + sbi->unusable_block_count +
@@ -1550,69 +1602,44 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
else
sbi->user_block_count -= shrunk_blocks;
spin_unlock(&sbi->stat_lock);
- if (err) {
- thaw_bdev(sbi->sb->s_bdev, sbi->sb);
- return err;
- }
-
- mutex_lock(&sbi->resize_mutex);
- set_sbi_flag(sbi, SBI_IS_RESIZEFS);
-
- mutex_lock(&DIRTY_I(sbi)->seglist_lock);
-
- MAIN_SECS(sbi) -= secs;
-
- for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
- if (SIT_I(sbi)->last_victim[gc_mode] >=
- MAIN_SECS(sbi) * sbi->segs_per_sec)
- SIT_I(sbi)->last_victim[gc_mode] = 0;
-
- for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
- if (sbi->next_victim_seg[gc_type] >=
- MAIN_SECS(sbi) * sbi->segs_per_sec)
- sbi->next_victim_seg[gc_type] = NULL_SEGNO;
-
- mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
+ if (err)
+ goto out_err;
- err = free_segment_range(sbi, MAIN_SECS(sbi) * sbi->segs_per_sec,
- MAIN_SEGS(sbi) - 1);
+ err = free_segment_range(sbi, secs, false);
if (err)
- goto out;
+ goto recover_out;
update_sb_metadata(sbi, -secs);
err = f2fs_commit_super(sbi, false);
if (err) {
update_sb_metadata(sbi, secs);
- goto out;
+ goto recover_out;
}
- mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, -secs);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- mutex_unlock(&sbi->cp_mutex);
- err = f2fs_sync_fs(sbi->sb, 1);
+ err = f2fs_write_checkpoint(sbi, &cpc);
if (err) {
- mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, secs);
- mutex_unlock(&sbi->cp_mutex);
update_sb_metadata(sbi, secs);
f2fs_commit_super(sbi, false);
}
-out:
+recover_out:
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
- MAIN_SECS(sbi) += secs;
spin_lock(&sbi->stat_lock);
sbi->user_block_count += shrunk_blocks;
spin_unlock(&sbi->stat_lock);
}
+out_err:
+ mutex_unlock(&sbi->cp_mutex);
+ up_write(&sbi->gc_lock);
+ thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
- mutex_unlock(&sbi->resize_mutex);
- thaw_bdev(sbi->sb->s_bdev, sbi->sb);
return err;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index bbac9d3787bd..db3c61046aa4 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/gc.h
*
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 8c4ea5003ef8..de841aaf3c43 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -67,22 +67,9 @@ static void str2hashbuf(const unsigned char *msg, size_t len,
*buf++ = pad;
}
-static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info,
- struct fscrypt_name *fname)
+static u32 TEA_hash_name(const u8 *p, size_t len)
{
- __u32 hash;
- f2fs_hash_t f2fs_hash;
- const unsigned char *p;
__u32 in[8], buf[4];
- const unsigned char *name = name_info->name;
- size_t len = name_info->len;
-
- /* encrypted bigname case */
- if (fname && !fname->disk_name.name)
- return cpu_to_le32(fname->hash);
-
- if (is_dot_dotdot(name_info))
- return 0;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -90,7 +77,6 @@ static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info,
buf[2] = 0x98badcfe;
buf[3] = 0x10325476;
- p = name;
while (1) {
str2hashbuf(p, len, in, 4);
TEA_transform(buf, in);
@@ -99,41 +85,43 @@ static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info,
break;
len -= 16;
}
- hash = buf[0];
- f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
- return f2fs_hash;
+ return buf[0] & ~F2FS_HASH_COL_BIT;
}
-f2fs_hash_t f2fs_dentry_hash(const struct inode *dir,
- const struct qstr *name_info, struct fscrypt_name *fname)
+/*
+ * Compute @fname->hash. For all directories, @fname->disk_name must be set.
+ * For casefolded directories, @fname->usr_fname must be set, and also
+ * @fname->cf_name if the filename is valid Unicode.
+ */
+void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
{
-#ifdef CONFIG_UNICODE
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
- int r, dlen;
- unsigned char *buff;
- struct qstr folded;
+ const u8 *name = fname->disk_name.name;
+ size_t len = fname->disk_name.len;
- if (!name_info->len || !IS_CASEFOLDED(dir))
- goto opaque_seq;
+ WARN_ON_ONCE(!name);
- buff = f2fs_kzalloc(sbi, sizeof(char) * PATH_MAX, GFP_KERNEL);
- if (!buff)
- return -ENOMEM;
-
- dlen = utf8_casefold(um, name_info, buff, PATH_MAX);
- if (dlen < 0) {
- kvfree(buff);
- goto opaque_seq;
+ if (is_dot_dotdot(name, len)) {
+ fname->hash = 0;
+ return;
}
- folded.name = buff;
- folded.len = dlen;
- r = __f2fs_dentry_hash(&folded, fname);
-
- kvfree(buff);
- return r;
-opaque_seq:
+#ifdef CONFIG_UNICODE
+ if (IS_CASEFOLDED(dir)) {
+ /*
+ * If the casefolded name is provided, hash it instead of the
+ * on-disk name. If the casefolded name is *not* provided, that
+ * should only be because the name wasn't valid Unicode, so fall
+ * back to treating the name as an opaque byte sequence.
+ */
+ WARN_ON_ONCE(!fname->usr_fname->name);
+ if (fname->cf_name.name) {
+ name = fname->cf_name.name;
+ len = fname->cf_name.len;
+ } else {
+ name = fname->usr_fname->name;
+ len = fname->usr_fname->len;
+ }
+ }
#endif
- return __f2fs_dentry_hash(name_info, fname);
+ fname->hash = cpu_to_le32(TEA_hash_name(name, len));
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 9686ffea177e..dbade310dc79 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -306,15 +306,14 @@ process_inline:
}
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
- struct fscrypt_name *fname, struct page **res_page)
+ const struct f2fs_filename *fname,
+ struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
struct page *ipage;
void *inline_dentry;
- f2fs_hash_t namehash;
ipage = f2fs_get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage)) {
@@ -322,12 +321,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
return NULL;
}
- namehash = f2fs_dentry_hash(dir, &name, fname);
-
inline_dentry = inline_data_addr(dir, ipage);
make_dentry_ptr_inline(dir, &d, inline_dentry);
- de = f2fs_find_target_dentry(fname, namehash, NULL, &d);
+ de = f2fs_find_target_dentry(&d, fname, NULL);
unlock_page(ipage);
if (de)
*res_page = ipage;
@@ -444,7 +441,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
while (bit_pos < d.max) {
struct f2fs_dir_entry *de;
- struct qstr new_name;
+ struct f2fs_filename fname;
nid_t ino;
umode_t fake_mode;
@@ -460,14 +457,19 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
continue;
}
- new_name.name = d.filename[bit_pos];
- new_name.len = le16_to_cpu(de->name_len);
+ /*
+ * We only need the disk_name and hash to move the dentry.
+ * We don't need the original or casefolded filenames.
+ */
+ memset(&fname, 0, sizeof(fname));
+ fname.disk_name.name = d.filename[bit_pos];
+ fname.disk_name.len = le16_to_cpu(de->name_len);
+ fname.hash = de->hash_code;
ino = le32_to_cpu(de->ino);
fake_mode = f2fs_get_de_type(de) << S_SHIFT;
- err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL,
- ino, fake_mode);
+ err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode);
if (err)
goto punch_dentry_pages;
@@ -544,7 +546,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
- struct fscrypt_name fname;
+ struct f2fs_filename fname;
void *inline_dentry = NULL;
int err = 0;
@@ -553,19 +555,19 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
f2fs_lock_op(sbi);
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname);
if (err)
goto out;
ipage = f2fs_get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
- goto out;
+ goto out_fname;
}
if (f2fs_has_enough_room(dir, ipage, &fname)) {
f2fs_put_page(ipage, 1);
- goto out;
+ goto out_fname;
}
inline_dentry = inline_data_addr(dir, ipage);
@@ -573,22 +575,22 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
err = do_convert_inline_dir(dir, ipage, inline_dentry);
if (!err)
f2fs_put_page(ipage, 1);
+out_fname:
+ f2fs_free_filename(&fname);
out:
f2fs_unlock_op(sbi);
return err;
}
-int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
- const struct qstr *orig_name,
- struct inode *inode, nid_t ino, umode_t mode)
+int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
+ struct inode *inode, nid_t ino, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
unsigned int bit_pos;
- f2fs_hash_t name_hash;
void *inline_dentry = NULL;
struct f2fs_dentry_ptr d;
- int slots = GET_DENTRY_SLOTS(new_name->len);
+ int slots = GET_DENTRY_SLOTS(fname->disk_name.len);
struct page *page = NULL;
int err = 0;
@@ -610,8 +612,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, new_name,
- orig_name, ipage);
+ page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -620,8 +621,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
- name_hash = f2fs_dentry_hash(dir, new_name, NULL);
- f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
+ f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash,
+ bit_pos);
set_page_dirty(ipage);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index f54119da2217..e94e02c6580a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -482,7 +482,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
nid_t ino = -1;
int err = 0;
unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
- struct fscrypt_name fname;
+ struct f2fs_filename fname;
trace_f2fs_lookup_start(dir, dentry, flags);
@@ -491,19 +491,20 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- err = fscrypt_prepare_lookup(dir, dentry, &fname);
+ err = f2fs_prepare_lookup(dir, dentry, &fname);
if (err == -ENOENT)
goto out_splice;
if (err)
goto out;
de = __f2fs_find_entry(dir, &fname, &page);
- fscrypt_free_filename(&fname);
+ f2fs_free_filename(&fname);
if (!de) {
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto out;
}
+ err = -ENOENT;
goto out_splice;
}
@@ -549,7 +550,7 @@ out_splice:
#endif
new = d_splice_alias(inode, dentry);
err = PTR_ERR_OR_ZERO(new);
- trace_f2fs_lookup_end(dir, dentry, ino, err);
+ trace_f2fs_lookup_end(dir, dentry, ino, !new ? -ENOENT : err);
return new;
out_iput:
iput(inode);
@@ -564,7 +565,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct f2fs_dir_entry *de;
struct page *page;
- int err = -ENOENT;
+ int err;
trace_f2fs_unlink_enter(dir, dentry);
@@ -1287,9 +1288,7 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
-#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
-#endif
};
const struct inode_operations f2fs_dir_inode_operations = {
@@ -1307,9 +1306,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
-#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
-#endif
.fiemap = f2fs_fiemap,
};
@@ -1317,9 +1314,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
-#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
-#endif
};
const struct inode_operations f2fs_special_inode_operations = {
@@ -1327,7 +1322,5 @@ const struct inode_operations f2fs_special_inode_operations = {
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
-#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
-#endif
};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ecbd6bd14a49..03e24df1c84f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1300,7 +1300,13 @@ static int read_node_page(struct page *page, int op_flags)
}
fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
- return f2fs_submit_page_bio(&fio);
+
+ err = f2fs_submit_page_bio(&fio);
+
+ if (!err)
+ f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
+
+ return err;
}
/*
@@ -1514,8 +1520,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
trace_f2fs_writepage(page, NODE);
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ unlock_page(page);
+ return 0;
+ }
goto redirty_out;
+ }
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
@@ -1801,6 +1814,53 @@ static bool flush_dirty_inode(struct page *page)
return true;
}
+int f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
+{
+ pgoff_t index = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ int ret = 0;
+
+ pagevec_init(&pvec);
+
+ while ((nr_pages = pagevec_lookup_tag(&pvec,
+ NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!IS_DNODE(page))
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ /* flush inline_data, if it's async context. */
+ if (is_inline_node(page)) {
+ clear_inline_node(page);
+ unlock_page(page);
+ flush_inline_data(sbi, ino_of_node(page));
+ continue;
+ }
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+
int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
struct writeback_control *wbc,
bool do_balance, enum iostat_type io_type)
@@ -1864,8 +1924,8 @@ continue_unlock:
goto continue_unlock;
}
- /* flush inline_data */
- if (is_inline_node(page)) {
+ /* flush inline_data, if it's async context. */
+ if (do_balance && is_inline_node(page)) {
clear_inline_node(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
@@ -2482,7 +2542,6 @@ void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct free_nid *i, *next;
int nr = nr_shrink;
if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
@@ -2491,17 +2550,23 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
if (!mutex_trylock(&nm_i->build_lock))
return 0;
- spin_lock(&nm_i->nid_list_lock);
- list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 ||
- nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
- break;
+ while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) {
+ struct free_nid *i, *next;
+ unsigned int batch = SHRINK_NID_BATCH_SIZE;
- __remove_free_nid(sbi, i, FREE_NID);
- kmem_cache_free(free_nid_slab, i);
- nr_shrink--;
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
+ if (!nr_shrink || !batch ||
+ nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
+ break;
+ __remove_free_nid(sbi, i, FREE_NID);
+ kmem_cache_free(free_nid_slab, i);
+ nr_shrink--;
+ batch--;
+ }
+ spin_unlock(&nm_i->nid_list_lock);
}
- spin_unlock(&nm_i->nid_list_lock);
+
mutex_unlock(&nm_i->build_lock);
return nr - nr_shrink;
@@ -2928,7 +2993,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
return 0;
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
- nm_i->nat_bits = f2fs_kzalloc(sbi,
+ nm_i->nat_bits = f2fs_kvzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
@@ -3061,9 +3126,9 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
int i;
nm_i->free_nid_bitmap =
- f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *),
- nm_i->nat_blocks),
- GFP_KERNEL);
+ f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *),
+ nm_i->nat_blocks),
+ GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index e05af5df5648..69e5859e993c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/node.h
*
@@ -15,6 +15,9 @@
#define FREE_NID_PAGES 8
#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
+/* size of free nid batch when shrinking */
+#define SHRINK_NID_BATCH_SIZE 8
+
#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */
/* maximum readahead size for node during getting data blocks */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dd804c07eeb0..ae5310f02e7f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -107,13 +107,51 @@ static void del_fsync_inode(struct fsync_inode_entry *entry, int drop)
kmem_cache_free(fsync_entry_slab, entry);
}
+static int init_recovered_filename(const struct inode *dir,
+ struct f2fs_inode *raw_inode,
+ struct f2fs_filename *fname,
+ struct qstr *usr_fname)
+{
+ int err;
+
+ memset(fname, 0, sizeof(*fname));
+ fname->disk_name.len = le32_to_cpu(raw_inode->i_namelen);
+ fname->disk_name.name = raw_inode->i_name;
+
+ if (WARN_ON(fname->disk_name.len > F2FS_NAME_LEN))
+ return -ENAMETOOLONG;
+
+ if (!IS_ENCRYPTED(dir)) {
+ usr_fname->name = fname->disk_name.name;
+ usr_fname->len = fname->disk_name.len;
+ fname->usr_fname = usr_fname;
+ }
+
+ /* Compute the hash of the filename */
+ if (IS_CASEFOLDED(dir)) {
+ err = f2fs_init_casefolded_name(dir, fname);
+ if (err)
+ return err;
+ f2fs_hash_filename(dir, fname);
+#ifdef CONFIG_UNICODE
+ /* Case-sensitive match is fine for recovery */
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+ } else {
+ f2fs_hash_filename(dir, fname);
+ }
+ return 0;
+}
+
static int recover_dentry(struct inode *inode, struct page *ipage,
struct list_head *dir_list)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
- struct fscrypt_name fname;
+ struct f2fs_filename fname;
+ struct qstr usr_fname;
struct page *page;
struct inode *dir, *einode;
struct fsync_inode_entry *entry;
@@ -132,16 +170,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
}
dir = entry->inode;
-
- memset(&fname, 0, sizeof(struct fscrypt_name));
- fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen);
- fname.disk_name.name = raw_inode->i_name;
-
- if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) {
- WARN_ON(1);
- err = -ENAMETOOLONG;
+ err = init_recovered_filename(dir, raw_inode, &fname, &usr_fname);
+ if (err)
goto out;
- }
retry:
de = __f2fs_find_entry(dir, &fname, &page);
if (de && inode->i_ino == le32_to_cpu(de->ino))
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index b7a9421472a7..196f31503511 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1029,9 +1029,9 @@ static void f2fs_submit_discard_endio(struct bio *bio)
struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
unsigned long flags;
- dc->error = blk_status_to_errno(bio->bi_status);
-
spin_lock_irqsave(&dc->lock, flags);
+ if (!dc->error)
+ dc->error = blk_status_to_errno(bio->bi_status);
dc->bio_ref--;
if (!dc->bio_ref && dc->state == D_SUBMIT) {
dc->state = D_DONE;
@@ -1101,7 +1101,6 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_UMOUNT) {
- dpolicy->max_requests = UINT_MAX;
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
dpolicy->granularity = 1;
@@ -1215,12 +1214,14 @@ submit:
len = total_len;
}
- if (!err && len)
+ if (!err && len) {
+ dcc->undiscard_blks -= len;
__update_discard_tree_range(sbi, bdev, lstart, start, len);
+ }
return err;
}
-static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+static void __insert_discard_tree(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t lstart,
block_t start, block_t len,
struct rb_node **insert_p,
@@ -1229,7 +1230,6 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct rb_node **p;
struct rb_node *parent = NULL;
- struct discard_cmd *dc = NULL;
bool leftmost = true;
if (insert_p && insert_parent) {
@@ -1241,12 +1241,8 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent,
lstart, &leftmost);
do_insert:
- dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent,
+ __attach_discard_cmd(sbi, bdev, lstart, start, len, parent,
p, leftmost);
- if (!dc)
- return NULL;
-
- return dc;
}
static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
@@ -1463,6 +1459,8 @@ next:
return issued;
}
+static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy);
static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
struct discard_policy *dpolicy)
@@ -1471,12 +1469,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
struct list_head *pend_list;
struct discard_cmd *dc, *tmp;
struct blk_plug plug;
- int i, issued = 0;
+ int i, issued;
bool io_interrupted = false;
if (dpolicy->timeout)
f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
+retry:
+ issued = 0;
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
if (dpolicy->timeout &&
f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
@@ -1523,6 +1523,11 @@ next:
break;
}
+ if (dpolicy->type == DPOLICY_UMOUNT && issued) {
+ __wait_all_discard_cmd(sbi, dpolicy);
+ goto retry;
+ }
+
if (!issued && io_interrupted)
issued = -1;
@@ -3102,6 +3107,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
type = CURSEG_COLD_DATA;
}
+ /*
+ * We need to wait for node_write to avoid block allocation during
+ * checkpoint. This can only happen to quota writes which can cause
+ * the below discard race condition.
+ */
+ if (IS_DATASEG(type))
+ down_write(&sbi->node_write);
+
down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
@@ -3167,6 +3180,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
up_read(&SM_I(sbi)->curseg_lock);
+ if (IS_DATASEG(type))
+ up_write(&sbi->node_write);
+
if (put_pin_sem)
up_read(&sbi->pin_sem);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7a83bd530812..cba16cca5189 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/segment.h
*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8a9955902d84..20e56b0fa46a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -285,6 +285,22 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).s_resgid));
}
+static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
+{
+ if (!F2FS_OPTION(sbi).unusable_cap_perc)
+ return;
+
+ if (F2FS_OPTION(sbi).unusable_cap_perc == 100)
+ F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count;
+ else
+ F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) *
+ F2FS_OPTION(sbi).unusable_cap_perc;
+
+ f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%",
+ F2FS_OPTION(sbi).unusable_cap,
+ F2FS_OPTION(sbi).unusable_cap_perc);
+}
+
static void init_once(void *foo)
{
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
@@ -471,11 +487,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!name)
return -ENOMEM;
- if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
+ if (!strcmp(name, "on")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
+ } else if (!strcmp(name, "off")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
- } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+ } else if (!strcmp(name, "sync")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
} else {
kvfree(name);
@@ -635,16 +651,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!name)
return -ENOMEM;
- if (strlen(name) == 8 &&
- !strncmp(name, "adaptive", 8)) {
+ if (!strcmp(name, "adaptive")) {
if (f2fs_sb_has_blkzoned(sbi)) {
f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature");
kvfree(name);
return -EINVAL;
}
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
- } else if (strlen(name) == 3 &&
- !strncmp(name, "lfs", 3)) {
+ } else if (!strcmp(name, "lfs")) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
} else {
kvfree(name);
@@ -769,14 +783,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
name = match_strdup(&args[0]);
if (!name)
return -ENOMEM;
- if (strlen(name) == 10 &&
- !strncmp(name, "user-based", 10)) {
+ if (!strcmp(name, "user-based")) {
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
- } else if (strlen(name) == 3 &&
- !strncmp(name, "off", 3)) {
+ } else if (!strcmp(name, "off")) {
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
- } else if (strlen(name) == 8 &&
- !strncmp(name, "fs-based", 8)) {
+ } else if (!strcmp(name, "fs-based")) {
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
} else {
kvfree(name);
@@ -789,11 +800,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!name)
return -ENOMEM;
- if (strlen(name) == 7 &&
- !strncmp(name, "default", 7)) {
+ if (!strcmp(name, "default")) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
- } else if (strlen(name) == 5 &&
- !strncmp(name, "reuse", 5)) {
+ } else if (!strcmp(name, "reuse")) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
} else {
kvfree(name);
@@ -805,14 +814,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
name = match_strdup(&args[0]);
if (!name)
return -ENOMEM;
- if (strlen(name) == 5 &&
- !strncmp(name, "posix", 5)) {
+ if (!strcmp(name, "posix")) {
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
- } else if (strlen(name) == 6 &&
- !strncmp(name, "strict", 6)) {
+ } else if (!strcmp(name, "strict")) {
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT;
- } else if (strlen(name) == 9 &&
- !strncmp(name, "nobarrier", 9)) {
+ } else if (!strcmp(name, "nobarrier")) {
F2FS_OPTION(sbi).fsync_mode =
FSYNC_MODE_NOBARRIER;
} else {
@@ -832,12 +838,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
if (arg < 0 || arg > 100)
return -EINVAL;
- if (arg == 100)
- F2FS_OPTION(sbi).unusable_cap =
- sbi->user_block_count;
- else
- F2FS_OPTION(sbi).unusable_cap =
- (sbi->user_block_count / 100) * arg;
+ F2FS_OPTION(sbi).unusable_cap_perc = arg;
set_opt(sbi, DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable_cap:
@@ -860,17 +861,18 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
name = match_strdup(&args[0]);
if (!name)
return -ENOMEM;
- if (strlen(name) == 3 && !strcmp(name, "lzo")) {
+ if (!strcmp(name, "lzo")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZO;
- } else if (strlen(name) == 3 &&
- !strcmp(name, "lz4")) {
+ } else if (!strcmp(name, "lz4")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZ4;
- } else if (strlen(name) == 4 &&
- !strcmp(name, "zstd")) {
+ } else if (!strcmp(name, "zstd")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_ZSTD;
+ } else if (!strcmp(name, "lzo-rle")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_LZORLE;
} else {
kfree(name);
return -EINVAL;
@@ -1330,7 +1332,8 @@ static int f2fs_statfs_project(struct super_block *sb,
limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
- curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ curblock = (dquot->dq_dqb.dqb_curspace +
+ dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
@@ -1465,6 +1468,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
case COMPRESS_ZSTD:
algtype = "zstd";
break;
+ case COMPRESS_LZORLE:
+ algtype = "lzo-rle";
+ break;
}
seq_printf(seq, ",compress_algorithm=%s", algtype);
@@ -1880,6 +1886,7 @@ skip:
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
limit_reserve_root(sbi);
+ adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
restore_gc:
@@ -3062,7 +3069,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
FDEV(devi).nr_blkz++;
- FDEV(devi).blkz_seq = f2fs_kzalloc(sbi,
+ FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi,
BITS_TO_LONGS(FDEV(devi).nr_blkz)
* sizeof(unsigned long),
GFP_KERNEL);
@@ -3449,7 +3456,6 @@ try_onemore:
init_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
- mutex_init(&sbi->resize_mutex);
init_rwsem(&sbi->node_write);
init_rwsem(&sbi->node_change);
@@ -3460,6 +3466,7 @@ try_onemore:
/* init iostat info */
spin_lock_init(&sbi->iostat_lock);
sbi->iostat_enable = false;
+ sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
for (i = 0; i < NR_PAGE_TYPE; i++) {
int n = (i == META) ? 1: NR_TEMP_TYPE;
@@ -3557,6 +3564,7 @@ try_onemore:
sbi->reserved_blocks = 0;
sbi->current_reserved_blocks = 0;
limit_reserve_root(sbi);
+ adjust_unusable_cap_perc(sbi);
for (i = 0; i < NR_INODE_TYPE; i++) {
INIT_LIST_HEAD(&sbi->inode_list[i]);
@@ -3927,7 +3935,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_bioset();
if (err)
goto free_bio_enrty_cache;
+ err = f2fs_init_compress_mempool();
+ if (err)
+ goto free_bioset;
return 0;
+free_bioset:
+ f2fs_destroy_bioset();
free_bio_enrty_cache:
f2fs_destroy_bio_entry_cache();
free_post_read:
@@ -3955,6 +3968,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_compress_mempool();
f2fs_destroy_bioset();
f2fs_destroy_bio_entry_cache();
f2fs_destroy_post_read_processing();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 3162f46b3c9b..e877c59b9fdb 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -15,6 +15,7 @@
#include "f2fs.h"
#include "segment.h"
#include "gc.h"
+#include <trace/events/f2fs.h>
static struct proc_dir_entry *f2fs_proc_root;
@@ -372,7 +373,6 @@ out:
return count;
}
-
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
if (!sbi->iostat_enable)
@@ -380,6 +380,15 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "iostat_period_ms")) {
+ if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS)
+ return -EINVAL;
+ spin_lock(&sbi->iostat_lock);
+ sbi->iostat_period_ms = (unsigned int)t;
+ spin_unlock(&sbi->iostat_lock);
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -538,6 +547,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
@@ -545,6 +555,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
@@ -618,6 +630,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
ATTR_LIST(iostat_enable),
+ ATTR_LIST(iostat_period_ms),
ATTR_LIST(readdir_ra),
ATTR_LIST(gc_pin_file_thresh),
ATTR_LIST(extension_list),
@@ -625,6 +638,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(inject_rate),
ATTR_LIST(inject_type),
#endif
+ ATTR_LIST(data_io_flag),
+ ATTR_LIST(node_io_flag),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
ATTR_LIST(unusable),
@@ -754,6 +769,33 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
return 0;
}
+void f2fs_record_iostat(struct f2fs_sb_info *sbi)
+{
+ unsigned long long iostat_diff[NR_IO_TYPE];
+ int i;
+
+ if (time_is_after_jiffies(sbi->iostat_next_period))
+ return;
+
+ /* Need double check under the lock */
+ spin_lock(&sbi->iostat_lock);
+ if (time_is_after_jiffies(sbi->iostat_next_period)) {
+ spin_unlock(&sbi->iostat_lock);
+ return;
+ }
+ sbi->iostat_next_period = jiffies +
+ msecs_to_jiffies(sbi->iostat_period_ms);
+
+ for (i = 0; i < NR_IO_TYPE; i++) {
+ iostat_diff[i] = sbi->rw_iostat[i] -
+ sbi->prev_rw_iostat[i];
+ sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
+ }
+ spin_unlock(&sbi->iostat_lock);
+
+ trace_f2fs_iostat(sbi, iostat_diff);
+}
+
static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset)
{
@@ -766,33 +808,58 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
seq_printf(seq, "time: %-16llu\n", now);
- /* print app IOs */
+ /* print app write IOs */
+ seq_puts(seq, "[WRITE]\n");
seq_printf(seq, "app buffered: %-16llu\n",
- sbi->write_iostat[APP_BUFFERED_IO]);
+ sbi->rw_iostat[APP_BUFFERED_IO]);
seq_printf(seq, "app direct: %-16llu\n",
- sbi->write_iostat[APP_DIRECT_IO]);
+ sbi->rw_iostat[APP_DIRECT_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
- sbi->write_iostat[APP_MAPPED_IO]);
+ sbi->rw_iostat[APP_MAPPED_IO]);
- /* print fs IOs */
+ /* print fs write IOs */
seq_printf(seq, "fs data: %-16llu\n",
- sbi->write_iostat[FS_DATA_IO]);
+ sbi->rw_iostat[FS_DATA_IO]);
seq_printf(seq, "fs node: %-16llu\n",
- sbi->write_iostat[FS_NODE_IO]);
+ sbi->rw_iostat[FS_NODE_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
- sbi->write_iostat[FS_META_IO]);
+ sbi->rw_iostat[FS_META_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
- sbi->write_iostat[FS_GC_DATA_IO]);
+ sbi->rw_iostat[FS_GC_DATA_IO]);
seq_printf(seq, "fs gc node: %-16llu\n",
- sbi->write_iostat[FS_GC_NODE_IO]);
+ sbi->rw_iostat[FS_GC_NODE_IO]);
seq_printf(seq, "fs cp data: %-16llu\n",
- sbi->write_iostat[FS_CP_DATA_IO]);
+ sbi->rw_iostat[FS_CP_DATA_IO]);
seq_printf(seq, "fs cp node: %-16llu\n",
- sbi->write_iostat[FS_CP_NODE_IO]);
+ sbi->rw_iostat[FS_CP_NODE_IO]);
seq_printf(seq, "fs cp meta: %-16llu\n",
- sbi->write_iostat[FS_CP_META_IO]);
+ sbi->rw_iostat[FS_CP_META_IO]);
+
+ /* print app read IOs */
+ seq_puts(seq, "[READ]\n");
+ seq_printf(seq, "app buffered: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_READ_IO]);
+ seq_printf(seq, "app direct: %-16llu\n",
+ sbi->rw_iostat[APP_DIRECT_READ_IO]);
+ seq_printf(seq, "app mapped: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_READ_IO]);
+
+ /* print fs read IOs */
+ seq_printf(seq, "fs data: %-16llu\n",
+ sbi->rw_iostat[FS_DATA_READ_IO]);
+ seq_printf(seq, "fs gc data: %-16llu\n",
+ sbi->rw_iostat[FS_GDATA_READ_IO]);
+ seq_printf(seq, "fs compr_data: %-16llu\n",
+ sbi->rw_iostat[FS_CDATA_READ_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
+ sbi->rw_iostat[FS_NODE_READ_IO]);
+ seq_printf(seq, "fs meta: %-16llu\n",
+ sbi->rw_iostat[FS_META_READ_IO]);
+
+ /* print other IOs */
+ seq_puts(seq, "[OTHER]\n");
seq_printf(seq, "fs discard: %-16llu\n",
- sbi->write_iostat[FS_DISCARD]);
+ sbi->rw_iostat[FS_DISCARD]);
return 0;
}
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
index e8075fc5b228..789f6aa727fc 100644
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* f2fs IO tracer
*
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 938fcd20565d..416d652774a3 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/f2fs/xattr.h
*
@@ -136,6 +136,7 @@ extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
#else
#define f2fs_xattr_handlers NULL
+#define f2fs_listxattr NULL
static inline int f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *page, int flags)
@@ -148,11 +149,6 @@ static inline int f2fs_getxattr(struct inode *inode, int index,
{
return -EOPNOTSUPP;
}
-static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
- size_t buffer_size)
-{
- return -EOPNOTSUPP;
-}
static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
#endif
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8ccc97356cb5..02b3c36b3676 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
list_add_tail(&req->intr_entry, &fiq->interrupts);
/*
* Pairs with smp_mb() implied by test_and_set_bit()
- * from request_end().
+ * from fuse_request_end().
*/
smp_mb();
if (test_bit(FR_FINISHED, &req->flags)) {
@@ -764,16 +764,15 @@ static int fuse_check_page(struct page *page)
{
if (page_mapcount(page) ||
page->mapping != NULL ||
- page_count(page) != 1 ||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
- 1 << PG_reclaim))) {
- pr_warn("trying to steal weird page\n");
- pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+ 1 << PG_reclaim |
+ 1 << PG_waiters))) {
+ dump_page(page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -1977,8 +1976,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- BUG_ON(nbuf >= pipe->ring_size);
- BUG_ON(tail == head);
+ if (WARN_ON(nbuf >= count || tail == head))
+ goto out_free;
+
ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index de1e2fde60bd..26f028bc760b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = d_inode(path->dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_current_process(fc))
+ if (!fuse_allow_current_process(fc)) {
+ if (!request_mask) {
+ /*
+ * If user explicitly requested *nothing* then don't
+ * error out, but return st_dev only.
+ */
+ stat->result_mask = 0;
+ stat->dev = inode->i_sb->s_dev;
+ return 0;
+ }
return -EACCES;
+ }
return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bac51c32d660..e573b0cd2737 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
struct fuse_writepage_args {
struct fuse_io_args ia;
- struct list_head writepages_entry;
+ struct rb_node writepages_entry;
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
@@ -366,17 +366,23 @@ struct fuse_writepage_args {
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
pgoff_t idx_from, pgoff_t idx_to)
{
- struct fuse_writepage_args *wpa;
+ struct rb_node *n;
+
+ n = fi->writepages.rb_node;
- list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
+ while (n) {
+ struct fuse_writepage_args *wpa;
pgoff_t curr_index;
+ wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
WARN_ON(get_fuse_inode(wpa->inode) != fi);
curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from < curr_index + wpa->ia.ap.num_pages &&
- curr_index <= idx_to) {
+ if (idx_from >= curr_index + wpa->ia.ap.num_pages)
+ n = n->rb_right;
+ else if (idx_to < curr_index)
+ n = n->rb_left;
+ else
return wpa;
- }
}
return NULL;
}
@@ -445,9 +451,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (is_bad_inode(inode))
return -EIO;
- if (fc->no_flush)
- return 0;
-
err = write_inode_now(inode, 1);
if (err)
return err;
@@ -460,6 +463,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
+ err = 0;
+ if (fc->no_flush)
+ goto inval_attr_out;
+
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -475,6 +482,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
fc->no_flush = 1;
err = 0;
}
+
+inval_attr_out:
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fc->writeback_cache)
+ fuse_invalidate_attr(inode);
return err;
}
@@ -712,6 +727,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
spin_unlock(&io->lock);
ia->ap.args.end = fuse_aio_complete_req;
+ ia->ap.args.may_block = io->should_dirty;
err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
if (err)
fuse_aio_complete_req(fc, &ia->ap.args, err);
@@ -1570,7 +1586,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- list_del(&wpa->writepages_entry);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1658,6 +1674,36 @@ __acquires(fi->lock)
}
}
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+ pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
+ pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(!wpa->ia.ap.num_pages);
+ while (*p) {
+ struct fuse_writepage_args *curr;
+ pgoff_t curr_index;
+
+ parent = *p;
+ curr = rb_entry(parent, struct fuse_writepage_args,
+ writepages_entry);
+ WARN_ON(curr->inode != wpa->inode);
+ curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
+
+ if (idx_from >= curr_index + curr->ia.ap.num_pages)
+ p = &(*p)->rb_right;
+ else if (idx_to < curr_index)
+ p = &(*p)->rb_left;
+ else
+ return (void) WARN_ON(true);
+ }
+
+ rb_link_node(&wpa->writepages_entry, parent, p);
+ rb_insert_color(&wpa->writepages_entry, root);
+}
+
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
int error)
{
@@ -1676,7 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
wpa->next = next->next;
next->next = NULL;
next->ia.ff = fuse_file_get(wpa->ia.ff);
- list_add(&next->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, next);
/*
* Skip fuse_flush_writepages() to make it easy to crop requests
@@ -1811,7 +1857,7 @@ static int fuse_writepage_locked(struct page *page)
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fi->lock);
- list_add(&wpa->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, wpa);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
@@ -1923,10 +1969,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
WARN_ON(new_ap->num_pages != 0);
spin_lock(&fi->lock);
- list_del(&new_wpa->writepages_entry);
+ rb_erase(&new_wpa->writepages_entry, &fi->writepages);
old_wpa = fuse_find_writeback(fi, page->index, page->index);
if (!old_wpa) {
- list_add(&new_wpa->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, new_wpa);
spin_unlock(&fi->lock);
return false;
}
@@ -2041,7 +2087,7 @@ static int fuse_writepages_fill(struct page *page,
wpa->inode = inode;
spin_lock(&fi->lock);
- list_add(&wpa->writepages_entry, &fi->writepages);
+ tree_insert(&fi->writepages, wpa);
spin_unlock(&fi->lock);
data->wpa = wpa;
@@ -3235,13 +3281,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
- if (fc->writeback_cache) {
- inode_lock(inode_in);
- err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
- inode_unlock(inode_in);
- if (err)
- return err;
- }
+ inode_lock(inode_in);
+ err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+ inode_unlock(inode_in);
+ if (err)
+ return err;
inode_lock(inode_out);
@@ -3249,11 +3293,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
- if (fc->writeback_cache) {
- err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
- if (err)
- goto out;
- }
+ /*
+ * Write out dirty pages in the destination file before sending the COPY
+ * request to userspace. After the request is completed, truncate off
+ * pages (including partial ones) from the cache that have been copied,
+ * since these contain stale data at that point.
+ *
+ * This should be mostly correct, but if the COPY writes to partial
+ * pages (at the start or end) and the parts not covered by the COPY are
+ * written through a memory map after calling fuse_writeback_range(),
+ * then these partial page modifications will be lost on truncation.
+ *
+ * It is unlikely that someone would rely on such mixed style
+ * modifications. Yet this does give less guarantees than if the
+ * copying was performed with write(2).
+ *
+ * To fix this a i_mmap_sem style lock could be used to prevent new
+ * faults while the copy is ongoing.
+ */
+ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+ if (err)
+ goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3274,6 +3334,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
+ truncate_inode_pages_range(inode_out->i_mapping,
+ ALIGN_DOWN(pos_out, PAGE_SIZE),
+ ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+
if (fc->writeback_cache) {
fuse_write_update_size(inode_out, pos_out + outarg.size);
file_update_time(file_out);
@@ -3351,5 +3415,5 @@ void fuse_init_file_inode(struct inode *inode)
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
- INIT_LIST_HEAD(&fi->writepages);
+ fi->writepages = RB_ROOT;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ca344bf71404..740a8a7d7ae6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -111,7 +111,7 @@ struct fuse_inode {
wait_queue_head_t page_waitq;
/* List of writepage requestst (pending or sent) */
- struct list_head writepages;
+ struct rb_root writepages;
};
/* readdir cache (directory only) */
@@ -249,6 +249,7 @@ struct fuse_args {
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
+ bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 95d712d44ca1..5b4aebf5821f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -321,6 +321,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
loff_t offset, loff_t len)
{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
@@ -329,6 +331,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
if (!inode)
return -ENOENT;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
+
fuse_invalidate_attr(inode);
forget_all_cached_acls(inode);
if (offset >= 0) {
@@ -1113,7 +1120,7 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
- struct fuse_dev *fud;
+ struct fuse_dev *fud = NULL;
struct fuse_conn *fc = get_fuse_conn_super(sb);
struct inode *root;
struct dentry *root_dentry;
@@ -1155,9 +1162,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_user_ns != &init_user_ns)
sb->s_xattr = fuse_no_acl_xattr_handlers;
- fud = fuse_dev_alloc_install(fc);
- if (!fud)
- goto err;
+ if (ctx->fudptr) {
+ err = -ENOMEM;
+ fud = fuse_dev_alloc_install(fc);
+ if (!fud)
+ goto err;
+ }
fc->dev = sb->s_dev;
fc->sb = sb;
@@ -1191,7 +1201,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (*ctx->fudptr)
+ if (ctx->fudptr && *ctx->fudptr)
goto err_unlock;
err = fuse_ctl_add_conn(fc);
@@ -1200,7 +1210,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- *ctx->fudptr = fud;
+ if (ctx->fudptr)
+ *ctx->fudptr = fud;
mutex_unlock(&fuse_mutex);
return 0;
@@ -1208,7 +1219,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_unlock(&fuse_mutex);
dput(root_dentry);
err_dev_free:
- fuse_dev_free(fud);
+ if (fud)
+ fuse_dev_free(fud);
err:
return err;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bade74768903..4c4ef5d69298 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -60,6 +60,12 @@ struct virtio_fs_forget {
struct virtio_fs_forget_req req;
};
+struct virtio_fs_req_work {
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq;
+ struct work_struct done_work;
+};
+
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
@@ -485,19 +491,67 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
}
/* Work function for request completion */
+static void virtio_fs_request_complete(struct fuse_req *req,
+ struct virtio_fs_vq *fsvq)
+{
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct fuse_conn *fc = fsvq->fud->fc;
+ struct fuse_args *args;
+ struct fuse_args_pages *ap;
+ unsigned int len, i, thislen;
+ struct page *page;
+
+ /*
+ * TODO verify that server properly follows FUSE protocol
+ * (oh.uniq, oh.len)
+ */
+ args = req->args;
+ copy_args_from_argbuf(args, req);
+
+ if (args->out_pages && args->page_zeroing) {
+ len = args->out_args[args->out_numargs - 1].size;
+ ap = container_of(args, typeof(*ap), args);
+ for (i = 0; i < ap->num_pages; i++) {
+ thislen = ap->descs[i].length;
+ if (len < thislen) {
+ WARN_ON(ap->descs[i].offset);
+ page = ap->pages[i];
+ zero_user_segment(page, len, thislen);
+ len = 0;
+ } else {
+ len -= thislen;
+ }
+ }
+ }
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(fc, req);
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_complete_req_work(struct work_struct *work)
+{
+ struct virtio_fs_req_work *w =
+ container_of(work, typeof(*w), done_work);
+
+ virtio_fs_request_complete(w->req, w->fsvq);
+ kfree(w);
+}
+
static void virtio_fs_requests_done_work(struct work_struct *work)
{
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
done_work);
struct fuse_pqueue *fpq = &fsvq->fud->pq;
- struct fuse_conn *fc = fsvq->fud->fc;
struct virtqueue *vq = fsvq->vq;
struct fuse_req *req;
- struct fuse_args_pages *ap;
struct fuse_req *next;
- struct fuse_args *args;
- unsigned int len, i, thislen;
- struct page *page;
+ unsigned int len;
LIST_HEAD(reqs);
/* Collect completed requests off the virtqueue */
@@ -515,38 +569,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
/* End requests */
list_for_each_entry_safe(req, next, &reqs, list) {
- /*
- * TODO verify that server properly follows FUSE protocol
- * (oh.uniq, oh.len)
- */
- args = req->args;
- copy_args_from_argbuf(args, req);
-
- if (args->out_pages && args->page_zeroing) {
- len = args->out_args[args->out_numargs - 1].size;
- ap = container_of(args, typeof(*ap), args);
- for (i = 0; i < ap->num_pages; i++) {
- thislen = ap->descs[i].length;
- if (len < thislen) {
- WARN_ON(ap->descs[i].offset);
- page = ap->pages[i];
- zero_user_segment(page, len, thislen);
- len = 0;
- } else {
- len -= thislen;
- }
- }
- }
-
- spin_lock(&fpq->lock);
- clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
- spin_lock(&fsvq->lock);
- dec_in_flight_req(fsvq);
- spin_unlock(&fsvq->lock);
+ /* blocking async request completes in a worker context */
+ if (req->args->may_block) {
+ struct virtio_fs_req_work *w;
+
+ w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
+ INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
+ w->fsvq = fsvq;
+ w->req = req;
+ schedule_work(&w->done_work);
+ } else {
+ virtio_fs_request_complete(req, fsvq);
+ }
}
}
@@ -1067,7 +1103,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
err = -ENOMEM;
/* Allocate fuse_dev for hiprio and notification queues */
- for (i = 0; i < VQ_REQUEST; i++) {
+ for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
fsvq->fud = fuse_dev_alloc();
@@ -1075,18 +1111,15 @@ static int virtio_fs_fill_super(struct super_block *sb)
goto err_free_fuse_devs;
}
- ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+ /* virtiofs allocates and installs its own fuse devices */
+ ctx.fudptr = NULL;
err = fuse_fill_super_common(sb, &ctx);
if (err < 0)
goto err_free_fuse_devs;
- fc = fs->vqs[VQ_REQUEST].fud->fc;
-
for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
- if (i == VQ_REQUEST)
- continue; /* already initialized */
fuse_dev_install(fsvq->fud, fc);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f3420a643b4f..ef5313f9c78f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -187,7 +187,7 @@ out:
}
/*
- * Called under down_write(mmap_sem).
+ * Called under mmap_write_lock(mm).
*/
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 4023c9846860..0b65a912b036 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -10,7 +10,6 @@
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
-#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
@@ -112,6 +111,7 @@ struct io_wq {
unsigned long state;
free_work_fn *free_work;
+ io_wq_work_fn *do_work;
struct task_struct *manager;
struct user_struct *user;
@@ -170,8 +170,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
dropped_lock = true;
}
__set_current_state(TASK_RUNNING);
- set_fs(KERNEL_DS);
- unuse_mm(worker->mm);
+ kthread_unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
@@ -418,18 +417,15 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
if (worker->mm) {
- unuse_mm(worker->mm);
+ kthread_unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
- if (!work->mm) {
- set_fs(KERNEL_DS);
+ if (!work->mm)
return;
- }
+
if (mmget_not_zero(work->mm)) {
- use_mm(work->mm);
- if (!worker->mm)
- set_fs(USER_DS);
+ kthread_use_mm(work->mm);
worker->mm = work->mm;
/* hang on to this mm */
work->mm = NULL;
@@ -528,7 +524,7 @@ get_next:
hash = io_get_work_hash(work);
linked = old_work = work;
- linked->func(&linked);
+ wq->do_work(&linked);
linked = (old_work == linked) ? NULL : linked;
work = next_hashed;
@@ -785,7 +781,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ wq->do_work(&work);
work = (work == old_work) ? NULL : work;
wq->free_work(old_work);
} while (work);
@@ -1023,7 +1019,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
- if (WARN_ON_ONCE(!data->free_work))
+ if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
@@ -1037,6 +1033,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}
wq->free_work = data->free_work;
+ wq->do_work = data->do_work;
/* caller must already hold a reference to this */
wq->user = data->user;
@@ -1093,7 +1090,7 @@ err:
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
- if (data->free_work != wq->free_work)
+ if (data->free_work != wq->free_work || data->do_work != wq->do_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5ba12de7572f..8e138fa88b9f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
@@ -94,11 +93,6 @@ struct io_wq_work {
pid_t task_pid;
};
-#define INIT_IO_WORK(work, _func) \
- do { \
- *(work) = (struct io_wq_work){ .func = _func }; \
- } while (0) \
-
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
if (!work->list.next)
@@ -108,10 +102,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
}
typedef void (free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work **);
struct io_wq_data {
struct user_struct *user;
+ io_wq_work_fn *do_work;
free_work_fn *free_work;
};
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9d4bd0d3a080..155f3d830ddb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -55,7 +55,6 @@
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
@@ -529,7 +528,6 @@ enum {
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
- REQ_F_IOPOLL_COMPLETED_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
@@ -541,6 +539,8 @@ enum {
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_QUEUE_TIMEOUT_BIT,
+ REQ_F_WORK_INITIALIZED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -572,8 +572,6 @@ enum {
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* polled IO has completed */
- REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* timeout request */
@@ -596,6 +594,10 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
+ /* needs to queue linked timeout */
+ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+ /* io_wq_work is initialized */
+ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
};
struct async_poll {
@@ -634,6 +636,8 @@ struct io_kiocb {
struct io_async_ctx *io;
int cflags;
u8 opcode;
+ /* polled IO has completed */
+ u8 iopoll_completed;
u16 buf_index;
@@ -698,6 +702,8 @@ struct io_op_def {
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* don't fail if file grab fails */
+ unsigned needs_file_no_error : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
@@ -804,6 +810,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_fs = 1,
},
[IORING_OP_CLOSE] = {
+ .needs_file = 1,
+ .needs_file_no_error = 1,
.file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
@@ -904,6 +912,19 @@ EXPORT_SYMBOL(io_uring_get_socket);
static void io_file_put_work(struct work_struct *work);
+/*
+ * Note: must call io_req_init_async() for the first time you
+ * touch any members of io_wq_work.
+ */
+static inline void io_req_init_async(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ return;
+
+ memset(&req->work, 0, sizeof(req->work));
+ req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
static inline bool io_async_submit(struct io_ring_ctx *ctx)
{
return ctx->flags & IORING_SETUP_SQPOLL;
@@ -1030,6 +1051,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ return;
+
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
@@ -1576,16 +1600,6 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *link;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
- io_wq_submit_work(workptr);
-}
-
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
@@ -1597,7 +1611,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
*workptr = &nxt->work;
link = io_prep_linked_timeout(nxt);
if (link)
- nxt->work.func = io_link_work_cb;
+ nxt->flags |= REQ_F_QUEUE_TIMEOUT;
}
/*
@@ -1782,7 +1796,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ if (READ_ONCE(req->iopoll_completed)) {
list_move_tail(&req->list, &done);
continue;
}
@@ -1963,7 +1977,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req_set_fail_links(req);
req->result = res;
if (res != -EAGAIN)
- req->flags |= REQ_F_IOPOLL_COMPLETED;
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
@@ -1996,7 +2010,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* For fast devices, IO may have already completed. If it has, add
* it to the front so we find it first.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ if (READ_ONCE(req->iopoll_completed))
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
@@ -2064,6 +2078,10 @@ static bool io_file_supports_async(struct file *file, int rw)
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
+ /* any ->read/write should understand O_NONBLOCK */
+ if (file->f_flags & O_NONBLOCK)
+ return true;
+
if (!(file->f_mode & FMODE_NOWAIT))
return false;
@@ -2106,8 +2124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_ioprio = get_current_ioprio();
/* don't allow async punt if RWF_NOWAIT was requested */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- (req->file->f_flags & O_NONBLOCK))
+ if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
if (force_nonblock)
@@ -2121,6 +2138,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->result = 0;
+ req->iopoll_completed = 0;
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2359,8 +2377,14 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ struct io_buffer *kbuf;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov[0].iov_len = kbuf->len;
return 0;
+ }
if (!req->rw.len)
return 0;
else if (req->rw.len > 1)
@@ -2742,7 +2766,8 @@ copy_iov:
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- if (!file_can_poll(req->file))
+ if (!(req->flags & REQ_F_NOWAIT) &&
+ !file_can_poll(req->file))
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
@@ -2762,6 +2787,8 @@ static int __io_splice_prep(struct io_kiocb *req,
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
sp->file_in = NULL;
sp->len = READ_ONCE(sqe->len);
@@ -2776,8 +2803,14 @@ static int __io_splice_prep(struct io_kiocb *req,
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
- if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+ /*
+ * Splice operation will be punted aync, and here need to
+ * modify io_wq_work.flags, so initialize io_wq_work firstly.
+ */
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_UNBOUND;
+ }
return 0;
}
@@ -2886,23 +2919,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static bool io_req_cancelled(struct io_kiocb *req)
-{
- if (req->work.flags & IO_WQ_WORK_CANCEL) {
- req_set_fail_links(req);
- io_cqring_add_event(req, -ECANCELED);
- io_put_req(req);
- return true;
- }
-
- return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
loff_t end = req->sync.off + req->sync.len;
int ret;
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2910,58 +2935,16 @@ static void __io_fsync(struct io_kiocb *req)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fsync(req);
- io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
- /* fsync always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_fsync_finish;
- return -EAGAIN;
- }
- __io_fsync(req);
return 0;
}
-static void __io_fallocate(struct io_kiocb *req)
-{
- int ret;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fallocate(req);
- io_steal_work(req, workptr);
-}
-
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
@@ -2972,66 +2955,74 @@ static int io_fallocate_prep(struct io_kiocb *req,
static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
+ int ret;
+
/* fallocate always requiring blocking context */
- if (force_nonblock) {
- req->work.func = io_fallocate_finish;
+ if (force_nonblock)
return -EAGAIN;
- }
- __io_fallocate(req);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+ req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
return 0;
}
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
int ret;
- if (sqe->ioprio || sqe->buf_index)
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
+ if (unlikely(sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
- req->open.dfd = READ_ONCE(sqe->fd);
- req->open.how.mode = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.how.flags = READ_ONCE(sqe->open_flags);
- if (force_o_largefile())
+ /* open.how should be already initialised */
+ if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
+ req->open.dfd = READ_ONCE(sqe->fd);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
-
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ u64 flags, mode;
+
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
+ mode = READ_ONCE(sqe->len);
+ flags = READ_ONCE(sqe->open_flags);
+ req->open.how = build_open_how(flags, mode);
+ return __io_openat_prep(req, sqe);
+}
+
static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct open_how __user *how;
- const char __user *fname;
size_t len;
int ret;
- if (sqe->ioprio || sqe->buf_index)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
-
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
@@ -3040,19 +3031,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ret)
return ret;
- if (!(req->open.how.flags & O_PATH) && force_o_largefile())
- req->open.how.flags |= O_LARGEFILE;
-
- req->open.filename = getname(fname);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
+ return __io_openat_prep(req, sqe);
}
static int io_openat2(struct io_kiocb *req, bool force_nonblock)
@@ -3092,7 +3071,6 @@ err:
static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
- req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, force_nonblock);
}
@@ -3181,7 +3159,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
- if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+ if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
@@ -3259,6 +3237,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
req->epoll.op = READ_ONCE(sqe->len);
@@ -3303,6 +3283,8 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
if (sqe->ioprio || sqe->buf_index || sqe->off)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
req->madvise.len = READ_ONCE(sqe->len);
@@ -3337,6 +3319,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->addr)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
req->fadvise.len = READ_ONCE(sqe->len);
@@ -3370,6 +3354,8 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
@@ -3410,10 +3396,14 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
/*
* If we queue this for async, it must not be cancellable. That would
- * leave the 'file' in an undeterminate state.
+ * leave the 'file' in an undeterminate state, and here need to modify
+ * io_wq_work.flags, so initialize io_wq_work firstly.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
@@ -3421,53 +3411,41 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- return 0;
-}
-
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- fput(req->close.put_file);
- io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ if ((req->file && req->file->f_op == &io_uring_fops) ||
+ req->close.fd == req->ctx->ring_fd)
+ return -EBADF;
- /* not cancellable, don't do io_req_cancelled() */
- __io_close_finish(req);
- io_steal_work(req, workptr);
+ req->close.put_file = NULL;
+ return 0;
}
static int io_close(struct io_kiocb *req, bool force_nonblock)
{
+ struct io_close *close = &req->close;
int ret;
- req->close.put_file = NULL;
- ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
- if (ret < 0)
- return (ret == -ENOENT) ? -EBADF : ret;
+ /* might be already done during nonblock submission */
+ if (!close->put_file) {
+ ret = __close_fd_get_file(close->fd, &close->put_file);
+ if (ret < 0)
+ return (ret == -ENOENT) ? -EBADF : ret;
+ }
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && force_nonblock) {
+ if (close->put_file->f_op->flush && force_nonblock) {
/* avoid grabbing files - we don't need the files */
req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
- req->work.func = io_close_finish;
return -EAGAIN;
}
- /*
- * No ->flush(), safely close from here and just punt the
- * fput() to async context.
- */
- __io_close_finish(req);
+ /* No ->flush() or already async, safely close from here */
+ ret = filp_close(close->put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ fput(close->put_file);
+ close->put_file = NULL;
+ io_put_req(req);
return 0;
}
@@ -3489,38 +3467,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
int ret;
+ /* sync_file_range always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_sync_file_range(req);
- io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
- /* sync_file_range always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_sync_file_range_finish;
- return -EAGAIN;
- }
-
- __io_sync_file_range(req);
return 0;
}
@@ -3546,6 +3506,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -3575,9 +3538,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
@@ -3631,9 +3591,6 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
@@ -3786,6 +3743,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -3814,9 +3774,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_buffer *kbuf;
@@ -3878,9 +3835,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
@@ -3948,49 +3902,30 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
- unsigned file_flags;
+ unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
int ret;
- file_flags = force_nonblock ? O_NONBLOCK : 0;
+ if (req->file->f_flags & O_NONBLOCK)
+ req->flags |= REQ_F_NOWAIT;
+
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
accept->addr_len, accept->flags,
accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail_links(req);
+ }
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
-static void io_accept_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_accept(req, false);
- io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
- int ret;
-
- ret = __io_accept(req, force_nonblock);
- if (ret == -EAGAIN && force_nonblock) {
- req->work.func = io_accept_finish;
- return -EAGAIN;
- }
- return 0;
-}
-
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
@@ -4329,7 +4264,8 @@ static void io_async_task_func(struct callback_head *cb)
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
if (!canceled) {
@@ -4426,7 +4362,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
return false;
req->flags |= REQ_F_POLLED;
- memcpy(&apoll->work, &req->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
had_io = req->io != NULL;
get_task_struct(current);
@@ -4451,7 +4388,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
if (!had_io)
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
return false;
}
@@ -4496,7 +4434,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
* io_req_work_drop_env below when dropping the
* final reference.
*/
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work,
+ sizeof(req->work));
kfree(apoll);
}
}
@@ -4945,6 +4885,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (!sqe)
return 0;
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
@@ -5382,12 +5324,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ /* link head's timeout is queued in io_queue_async_work() */
+ if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+ return;
+
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
int ret = 0;
+ io_arm_async_linked_timeout(req);
+
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
@@ -5438,19 +5394,20 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (!file)
- return -EBADF;
- req->fixed_file_refs = ctx->file_data->cur_refs;
- percpu_ref_get(req->fixed_file_refs);
+ if (file) {
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
- if (unlikely(!file))
- return -EBADF;
}
- *out_file = file;
- return 0;
+ if (file || io_op_defs[req->opcode].needs_file_no_error) {
+ *out_file = file;
+ return 0;
+ }
+ return -EBADF;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -5584,7 +5541,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
again:
linked_timeout = io_prep_linked_timeout(req);
- if (req->work.creds && req->work.creds != current_cred()) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+ req->work.creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
if (old_creds == req->work.creds)
@@ -5607,6 +5565,8 @@ again:
goto exit;
}
punt:
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
@@ -5859,7 +5819,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
@@ -5867,7 +5826,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (io_op_defs[req->opcode].needs_mm && !current->mm) {
if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
return -EFAULT;
- use_mm(ctx->sqo_mm);
+ kthread_use_mm(ctx->sqo_mm);
}
sqe_flags = READ_ONCE(sqe->flags);
@@ -5881,6 +5840,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
id = READ_ONCE(sqe->personality);
if (id) {
+ io_req_init_async(req);
req->work.creds = idr_find(&ctx->personality_idr, id);
if (unlikely(!req->work.creds))
return -EINVAL;
@@ -5981,7 +5941,7 @@ static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
struct mm_struct *mm = current->mm;
if (mm) {
- unuse_mm(mm);
+ kthread_unuse_mm(mm);
mmput(mm);
}
}
@@ -5990,15 +5950,12 @@ static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
const struct cred *old_cred;
- mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned long timeout;
int ret = 0;
complete(&ctx->sq_thread_comp);
- old_fs = get_fs();
- set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
timeout = jiffies + ctx->sq_thread_idle;
@@ -6103,7 +6060,6 @@ static int io_sq_thread(void *data)
if (current->task_works)
task_work_run();
- set_fs(old_fs);
io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
@@ -6879,6 +6835,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
data.user = ctx->user;
data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
@@ -7160,8 +7117,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
if (!pages || nr_pages > got_pages) {
- kfree(vmas);
- kfree(pages);
+ kvfree(vmas);
+ kvfree(pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
vmas = kvmalloc_array(nr_pages,
@@ -7186,7 +7143,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = 0;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
@@ -7204,7 +7161,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
} else {
ret = pret < 0 ? pret : -EFAULT;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (ret) {
/*
* if we did partial map, or found file backed vmas,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index a1ed7620fbac..bcfc288dba3f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -870,7 +870,7 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap, struct iomap *srcmap)
{
long status = 0;
- ssize_t written = 0;
+ loff_t written = 0;
/* don't bother with blocks that are not shared to start with */
if (!(iomap->flags & IOMAP_F_SHARED))
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a49d0e670ddf..e4944436e733 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1140,6 +1140,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
+ mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1402,7 +1403,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
printk(KERN_ERR "JBD2: Error %d detected when updating "
"journal superblock for %s.\n", ret,
journal->j_devname);
- jbd2_journal_abort(journal, ret);
+ if (!is_journal_aborted(journal))
+ jbd2_journal_abort(journal, ret);
}
return ret;
@@ -2154,6 +2156,13 @@ void jbd2_journal_abort(journal_t *journal, int errno)
transaction_t *transaction;
/*
+ * Lock the aborting procedure until everything is done, this avoid
+ * races between filesystem's error handling flow (e.g. ext4_abort()),
+ * ensure panic after the error info is written into journal's
+ * superblock.
+ */
+ mutex_lock(&journal->j_abort_mutex);
+ /*
* ESHUTDOWN always takes precedence because a file system check
* caused by any other journal abort error is not required after
* a shutdown triggered.
@@ -2167,6 +2176,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
journal->j_errno = errno;
jbd2_journal_update_sb_errno(journal);
}
+ mutex_unlock(&journal->j_abort_mutex);
return;
}
@@ -2188,10 +2198,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
* layer could realise that a filesystem check is needed.
*/
jbd2_journal_update_sb_errno(journal);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
+ mutex_unlock(&journal->j_abort_mutex);
}
/**
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 0637271f3770..8ff4d1a1e774 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -259,7 +259,7 @@ struct jffs2_full_dirent
uint32_t ino; /* == zero for unlink */
unsigned int nhash;
unsigned char type;
- unsigned char name[0];
+ unsigned char name[];
};
/*
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 60207a2ae952..e4131cb1f1d4 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -61,7 +61,7 @@ struct jffs2_sum_dirent_flash
jint32_t ino; /* == zero for unlink */
uint8_t nsize; /* dirent name size */
uint8_t type; /* dirent type */
- uint8_t name[0]; /* dirent name */
+ uint8_t name[]; /* dirent name */
} __attribute__((packed));
struct jffs2_sum_xattr_flash
@@ -117,7 +117,7 @@ struct jffs2_sum_dirent_mem
jint32_t ino; /* == zero for unlink */
uint8_t nsize; /* dirent name size */
uint8_t type; /* dirent type */
- uint8_t name[0]; /* dirent name */
+ uint8_t name[]; /* dirent name */
} __attribute__((packed));
struct jffs2_sum_xattr_mem
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index fd6ddfe4cd94..06b342d8462b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -652,9 +652,9 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
* The following is done to give a different lockdep key to
* @of->mutex for files which implement mmap. This is a rather
* crude way to avoid false positive lockdep warning around
- * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
+ * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
* reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
- * which mm->mmap_sem nests, while holding @of->mutex. As each
+ * which mm->mmap_lock nests, while holding @of->mutex. As each
* open file has a separate mutex, it's okay as long as those don't
* happen on the same file. At this point, we can't easily give
* each file a separate locking class. Let's differentiate on
diff --git a/fs/locks.c b/fs/locks.c
index 6fd1f6e83178..7df0f9fa66f4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1557,6 +1557,9 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
{
bool rc;
+ if (lease->fl_lmops->lm_breaker_owns_lease
+ && lease->fl_lmops->lm_breaker_owns_lease(lease))
+ return false;
if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
rc = false;
goto trace;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6d499ab254b7..f30ed401cc6d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -684,9 +684,6 @@ bool __is_local_mountpoint(struct dentry *dentry)
struct mount *mnt;
bool is_covered = false;
- if (!d_mountpoint(dentry))
- goto out;
-
down_read(&namespace_sem);
lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) {
@@ -698,7 +695,7 @@ bool __is_local_mountpoint(struct dentry *dentry)
}
unlock_ns_list(ns);
up_read(&namespace_sem);
-out:
+
return is_covered;
}
@@ -1937,6 +1934,9 @@ struct vfsmount *clone_private_mount(const struct path *path)
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
+ /* Longterm mount to be removed by kern_unmount*() */
+ new_mnt->mnt_ns = MNT_NS_INTERNAL;
+
return &new_mnt->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -3863,6 +3863,19 @@ void kern_unmount(struct vfsmount *mnt)
}
EXPORT_SYMBOL(kern_unmount);
+void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++)
+ if (mnt[i])
+ real_mount(mnt[i])->mnt_ns = NULL;
+ synchronize_rcu_expedited();
+ for (i = 0; i < num; i++)
+ mntput(mnt[i]);
+}
+EXPORT_SYMBOL(kern_unmount_array);
+
bool our_mnt(struct vfsmount *mnt)
{
return check_mnt(real_mount(mnt));
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a57e7c72c7f4..1b79dd5cf661 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -446,7 +446,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
- ssize_t result = -EINVAL, requested;
+ ssize_t result, requested;
size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
@@ -731,6 +731,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (request_commit) {
kref_get(&req->wb_kref);
+ memcpy(&req->wb_verf, &hdr->verf.verifier,
+ sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo,
hdr->ds_commit_idx);
}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 963800037609..e87d500ad95a 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -39,7 +39,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
#include <linux/string.h>
#include <linux/kmod.h>
#include <linux/slab.h>
-#include <linux/module.h>
#include <linux/socket.h>
#include <linux/seq_file.h>
#include <linux/inet.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b9d0921cb4fe..0bf1f835de01 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -833,6 +833,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
do_update |= cache_validity & NFS_INO_INVALID_ATIME;
if (request_mask & (STATX_CTIME|STATX_MTIME))
do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+ if (request_mask & STATX_BLOCKS)
+ do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
if (do_update) {
/* Update the attribute cache */
if (!(server->flags & NFS_MOUNT_NOAC))
@@ -1764,7 +1766,8 @@ out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr,
NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME
- | NFS_INO_INVALID_MTIME);
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_BLOCKS);
return status;
}
@@ -1871,7 +1874,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED
- | NFS_INO_REVAL_PAGECACHE);
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_INVALID_BLOCKS);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
@@ -2033,8 +2037,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
} else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
- else
+ else {
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_BLOCKS
+ | NFS_INO_REVAL_FORCED);
cache_revalidated = false;
+ }
/* Update attrtimeo value if we're out of the unstable period */
if (attr_changed) {
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a46d1d5d16d8..2397ceedba8a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -179,11 +179,11 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
if (nfs_lookup_is_soft_revalidate(dentry))
task_flags |= RPC_TASK_TIMEOUT;
- dprintk("NFS call lookup %pd2\n", dentry);
res.dir_attr = nfs_alloc_fattr();
if (res.dir_attr == NULL)
return -ENOMEM;
+ dprintk("NFS call lookup %pd2\n", dentry);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
nfs_refresh_inode(dir, res.dir_attr);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9056f3dd380e..e32717fd1169 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7909,7 +7909,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
}
static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
- .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
+ .rpc_call_done = nfs4_bind_one_conn_to_session_done,
};
/*
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 7e7a97ae21ed..547cec79899f 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -961,6 +961,97 @@ TRACE_EVENT(nfs_readpage_done,
)
);
+TRACE_EVENT(nfs_readpage_short,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(bool, eof)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->status = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->eof = hdr->res.eof;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u status=%d%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->status,
+ __entry->eof ? " eof" : ""
+ )
+);
+
+TRACE_EVENT(nfs_pgio_error,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error,
+ loff_t pos
+ ),
+
+ TP_ARGS(hdr, error, pos),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(loff_t, pos)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->status = error;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u pos=%llu status=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count, __entry->res_count,
+ __entry->pos, __entry->status
+ )
+);
+
TRACE_DEFINE_ENUM(NFS_UNSTABLE);
TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
TRACE_DEFINE_ENUM(NFS_FILE_SYNC);
@@ -1312,7 +1403,12 @@ TRACE_EVENT(nfs_xdr_status,
__field(unsigned int, task_id)
__field(unsigned int, client_id)
__field(u32, xid)
+ __field(int, version)
__field(unsigned long, error)
+ __string(program,
+ xdr->rqst->rq_task->tk_client->cl_program->name)
+ __string(procedure,
+ xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
),
TP_fast_assign(
@@ -1322,13 +1418,19 @@ TRACE_EVENT(nfs_xdr_status,
__entry->task_id = task->tk_pid;
__entry->client_id = task->tk_client->cl_clid;
__entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->version = task->tk_client->cl_vers;
__entry->error = error;
+ __assign_str(program,
+ task->tk_client->cl_program->name)
+ __assign_str(procedure, task->tk_msg.rpc_proc->p_name)
),
TP_printk(
- "task:%u@%d xid=0x%08x error=%ld (%s)",
+ "task:%u@%d xid=0x%08x %sv%d %s error=%ld (%s)",
__entry->task_id, __entry->client_id, __entry->xid,
- -__entry->error, nfs_show_status(__entry->error)
+ __get_str(program), __entry->version,
+ __get_str(procedure), -__entry->error,
+ nfs_show_status(__entry->error)
)
);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6ca421cbe19c..6ea4cac41e46 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -24,6 +24,7 @@
#include "internal.h"
#include "pnfs.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -64,6 +65,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
unsigned int new = pos - hdr->io_start;
+ trace_nfs_pgio_error(hdr, error, pos);
if (hdr->good_bytes > new) {
hdr->good_bytes = new;
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 13b22e898116..eb854f1f86e2 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -264,6 +264,8 @@ static void nfs_readpage_retry(struct rpc_task *task,
/* This is a short read! */
nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
+ trace_nfs_readpage_short(task, hdr);
+
/* Has the server at least made some progress? */
if (resp->count == 0) {
nfs_set_pgio_error(hdr, -EIO, argp->offset);
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
index f1b27411dcc0..ebcbdc40483b 100644
--- a/fs/nfs/sysfs.h
+++ b/fs/nfs/sysfs.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2019 Hammerspace Inc
*/
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 10ec5ecdf117..65c331f75e9c 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -78,6 +78,8 @@ enum {
/* Checksum this amount of the request */
#define RC_CSUMLEN (256U)
+int nfsd_drc_slab_create(void);
+void nfsd_drc_slab_free(void);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 09aa545825bd..9217cb64bf0e 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -139,7 +139,6 @@ struct nfsd_net {
* Duplicate reply cache
*/
struct nfsd_drc_bucket *drc_hashtbl;
- struct kmem_cache *drc_slab;
/* max number of entries allowed in the cache */
unsigned int max_drc_entries;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5cf91322de0f..7fbe9840a03e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -38,6 +38,7 @@
#include "nfsd.h"
#include "state.h"
#include "netns.h"
+#include "trace.h"
#include "xdr4cb.h"
#include "xdr4.h"
@@ -904,16 +905,20 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
if (clp->cl_minorversion == 0) {
if (!clp->cl_cred.cr_principal &&
- (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
+ (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) {
+ trace_nfsd_cb_setup_err(clp, -EINVAL);
return -EINVAL;
+ }
args.client_name = clp->cl_cred.cr_principal;
args.prognumber = conn->cb_prog;
args.protocol = XPRT_TRANSPORT_TCP;
args.authflavor = clp->cl_cred.cr_flavor;
clp->cl_cb_ident = conn->cb_ident;
} else {
- if (!conn->cb_xprt)
+ if (!conn->cb_xprt) {
+ trace_nfsd_cb_setup_err(clp, -EINVAL);
return -EINVAL;
+ }
clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
@@ -925,32 +930,27 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
- dprintk("NFSD: couldn't create callback client: %ld\n",
- PTR_ERR(client));
+ trace_nfsd_cb_setup_err(clp, PTR_ERR(client));
return PTR_ERR(client);
}
cred = get_backchannel_cred(clp, client, ses);
if (!cred) {
+ trace_nfsd_cb_setup_err(clp, -ENOMEM);
rpc_shutdown_client(client);
return -ENOMEM;
}
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
+ trace_nfsd_cb_setup(clp);
return 0;
}
-static void warn_no_callback_path(struct nfs4_client *clp, int reason)
-{
- dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
- (int)clp->cl_name.len, clp->cl_name.data, reason);
-}
-
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
clp->cl_cb_state = NFSD4_CB_DOWN;
- warn_no_callback_path(clp, reason);
+ trace_nfsd_cb_state(clp);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
@@ -958,17 +958,20 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
clp->cl_cb_state = NFSD4_CB_FAULT;
- warn_no_callback_path(clp, reason);
+ trace_nfsd_cb_state(clp);
}
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
+ trace_nfsd_cb_done(clp, task->tk_status);
if (task->tk_status)
nfsd4_mark_cb_down(clp, task->tk_status);
- else
+ else {
clp->cl_cb_state = NFSD4_CB_UP;
+ trace_nfsd_cb_state(clp);
+ }
}
static void nfsd4_cb_probe_release(void *calldata)
@@ -993,6 +996,7 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
void nfsd4_probe_callback(struct nfs4_client *clp)
{
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ trace_nfsd_cb_state(clp);
set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
nfsd4_run_cb(&clp->cl_cb_null);
}
@@ -1009,6 +1013,7 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
spin_lock(&clp->cl_lock);
memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
spin_unlock(&clp->cl_lock);
+ trace_nfsd_cb_state(clp);
}
/*
@@ -1165,8 +1170,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
- dprintk("%s: minorversion=%d\n", __func__,
- clp->cl_minorversion);
+ trace_nfsd_cb_done(clp, task->tk_status);
if (!nfsd4_cb_sequence_done(task, cb))
return;
@@ -1271,6 +1275,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
* kill the old client:
*/
if (clp->cl_cb_client) {
+ trace_nfsd_cb_shutdown(clp);
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
put_cred(clp->cl_cb_cred);
@@ -1301,6 +1306,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
err = setup_callback_client(clp, &conn, ses);
if (err) {
nfsd4_mark_cb_down(clp, err);
+ if (c)
+ svc_xprt_put(c->cn_xprt);
return;
}
}
@@ -1314,6 +1321,8 @@ nfsd4_run_cb_work(struct work_struct *work)
struct rpc_clnt *clnt;
int flags;
+ trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name);
+
if (cb->cb_need_restart) {
cb->cb_need_restart = false;
} else {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0e75f7fb5fec..a09c35f0f6f0 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1155,7 +1155,7 @@ extern void nfs_sb_deactive(struct super_block *sb);
#define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys"
-/**
+/*
* Support one copy source server for now.
*/
static __be32
@@ -1245,10 +1245,9 @@ nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
mntput(ss_mnt);
}
-/**
- * nfsd4_setup_inter_ssc
- *
+/*
* Verify COPY destination stateid.
+ *
* Connect to the source server with NFSv4.1.
* Create the source struct file for nfsd_copy_range.
* Called with COPY cstate:
@@ -2302,6 +2301,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
}
check_if_stalefh_allowed(args);
+ rqstp->rq_lease_breaker = (void **)&cstate->clp;
+
trace_nfsd_compound(rqstp, args->opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c107caa56525..bb3d2c32664a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -51,6 +51,7 @@
#include "netns.h"
#include "pnfs.h"
#include "filecache.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -167,9 +168,6 @@ renew_client_locked(struct nfs4_client *clp)
return;
}
- dprintk("renewing client (clientid %08x/%08x)\n",
- clp->cl_clientid.cl_boot,
- clp->cl_clientid.cl_id);
list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = ktime_get_boottime_seconds();
}
@@ -1922,8 +1920,7 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
*/
if (clid->cl_boot == (u32)nn->boot_time)
return 0;
- dprintk("NFSD stale clientid (%08x/%08x) boot_time %08llx\n",
- clid->cl_boot, clid->cl_id, nn->boot_time);
+ trace_nfsd_clid_stale(clid);
return 1;
}
@@ -2406,6 +2403,11 @@ static void states_stop(struct seq_file *s, void *v)
spin_unlock(&clp->cl_lock);
}
+static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
+{
+ seq_printf(s, "filename: \"%pD2\"", f->nf_file);
+}
+
static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
{
struct inode *inode = f->nf_inode;
@@ -2422,6 +2424,12 @@ static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
}
+static void nfs4_show_stateid(struct seq_file *s, stateid_t *stid)
+{
+ seq_printf(s, "0x%.8x", stid->si_generation);
+ seq_printf(s, "%12phN", &stid->si_opaque);
+}
+
static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
{
struct nfs4_ol_stateid *ols;
@@ -2437,7 +2445,9 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
nf = st->sc_file;
file = find_any_file(nf);
- seq_printf(s, "- 0x%16phN: { type: open, ", &st->sc_stateid);
+ seq_printf(s, "- ");
+ nfs4_show_stateid(s, &st->sc_stateid);
+ seq_printf(s, ": { type: open, ");
access = bmap_to_share_mode(ols->st_access_bmap);
deny = bmap_to_share_mode(ols->st_deny_bmap);
@@ -2451,6 +2461,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
nfs4_show_superblock(s, file);
seq_printf(s, ", ");
+ nfs4_show_fname(s, file);
+ seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
nfsd_file_put(file);
@@ -2470,7 +2482,9 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
nf = st->sc_file;
file = find_any_file(nf);
- seq_printf(s, "- 0x%16phN: { type: lock, ", &st->sc_stateid);
+ seq_printf(s, "- ");
+ nfs4_show_stateid(s, &st->sc_stateid);
+ seq_printf(s, ": { type: lock, ");
/*
* Note: a lock stateid isn't really the same thing as a lock,
@@ -2482,6 +2496,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
nfs4_show_superblock(s, file);
/* XXX: open stateid? */
seq_printf(s, ", ");
+ nfs4_show_fname(s, file);
+ seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
nfsd_file_put(file);
@@ -2499,7 +2515,9 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
nf = st->sc_file;
file = nf->fi_deleg_file;
- seq_printf(s, "- 0x%16phN: { type: deleg, ", &st->sc_stateid);
+ seq_printf(s, "- ");
+ nfs4_show_stateid(s, &st->sc_stateid);
+ seq_printf(s, ": { type: deleg, ");
/* Kinda dead code as long as we only support read delegs: */
seq_printf(s, "access: %s, ",
@@ -2508,6 +2526,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
/* XXX: lease time, whether it's being recalled. */
nfs4_show_superblock(s, file);
+ seq_printf(s, ", ");
+ nfs4_show_fname(s, file);
seq_printf(s, " }\n");
return 0;
@@ -2521,11 +2541,15 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
file = ls->ls_file;
- seq_printf(s, "- 0x%16phN: { type: layout, ", &st->sc_stateid);
+ seq_printf(s, "- ");
+ nfs4_show_stateid(s, &st->sc_stateid);
+ seq_printf(s, ": { type: layout, ");
/* XXX: What else would be useful? */
nfs4_show_superblock(s, file);
+ seq_printf(s, ", ");
+ nfs4_show_fname(s, file);
seq_printf(s, " }\n");
return 0;
@@ -2845,14 +2869,12 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
conn->cb_prog = se->se_callback_prog;
conn->cb_ident = se->se_callback_ident;
memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
+ trace_nfsd_cb_args(clp, conn);
return;
out_err:
conn->cb_addr.ss_family = AF_UNSPEC;
conn->cb_addrlen = 0;
- dprintk("NFSD: this client (clientid %08x/%08x) "
- "will not receive delegations\n",
- clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
-
+ trace_nfsd_cb_nodelegs(clp);
return;
}
@@ -3458,6 +3480,45 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp,
return nfs_ok;
}
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+ struct nfsd4_conn *c;
+
+ list_for_each_entry(c, &s->se_conns, cn_persession) {
+ if (c->cn_xprt == xpt) {
+ return c;
+ }
+ }
+ return NULL;
+}
+
+static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst,
+ struct nfsd4_session *session, u32 req)
+{
+ struct nfs4_client *clp = session->se_client;
+ struct svc_xprt *xpt = rqst->rq_xprt;
+ struct nfsd4_conn *c;
+ __be32 status;
+
+ /* Following the last paragraph of RFC 5661 Section 18.34.3: */
+ spin_lock(&clp->cl_lock);
+ c = __nfsd4_find_conn(xpt, session);
+ if (!c)
+ status = nfserr_noent;
+ else if (req == c->cn_flags)
+ status = nfs_ok;
+ else if (req == NFS4_CDFC4_FORE_OR_BOTH &&
+ c->cn_flags != NFS4_CDFC4_BACK)
+ status = nfs_ok;
+ else if (req == NFS4_CDFC4_BACK_OR_BOTH &&
+ c->cn_flags != NFS4_CDFC4_FORE)
+ status = nfs_ok;
+ else
+ status = nfserr_inval;
+ spin_unlock(&clp->cl_lock);
+ return status;
+}
+
__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -3479,6 +3540,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
status = nfserr_wrong_cred;
if (!nfsd4_mach_creds_match(session->se_client, rqstp))
goto out;
+ status = nfsd4_match_existing_connection(rqstp, session, bcts->dir);
+ if (status == nfs_ok || status == nfserr_inval)
+ goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
goto out;
@@ -3544,18 +3608,6 @@ out:
return status;
}
-static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
-{
- struct nfsd4_conn *c;
-
- list_for_each_entry(c, &s->se_conns, cn_persession) {
- if (c->cn_xprt == xpt) {
- return c;
- }
- }
- return NULL;
-}
-
static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
{
struct nfs4_client *clp = ses->se_client;
@@ -3879,23 +3931,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (clp_used_exchangeid(conf))
goto out;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
- char addr_str[INET6_ADDRSTRLEN];
- rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
- sizeof(addr_str));
- dprintk("NFSD: setclientid: string in use by client "
- "at %s\n", addr_str);
+ trace_nfsd_clid_inuse_err(conf);
goto out;
}
}
unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
unhash_client_locked(unconf);
+ /* We need to handle only case 1: probable callback update */
if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
- /* case 1: probable callback update */
copy_clid(new, conf);
gen_confirm(new, nn);
- } else /* case 4 (new client) or cases 2, 3 (client reboot): */
- ;
+ }
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
add_to_unconfirmed(new);
@@ -4076,7 +4123,6 @@ out_free_openowner_slab:
out_free_client_slab:
kmem_cache_destroy(client_slab);
out:
- dprintk("nfsd4: out of memory while initializing nfsv4\n");
return -ENOMEM;
}
@@ -4508,6 +4554,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
+ trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid);
+
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -4522,6 +4570,19 @@ nfsd_break_deleg_cb(struct file_lock *fl)
return ret;
}
+static bool nfsd_breaker_owns_lease(struct file_lock *fl)
+{
+ struct nfs4_delegation *dl = fl->fl_owner;
+ struct svc_rqst *rqst;
+ struct nfs4_client *clp;
+
+ if (!i_am_nfsd())
+ return NULL;
+ rqst = kthread_data(current);
+ clp = *(rqst->rq_lease_breaker);
+ return dl->dl_stid.sc_client == clp;
+}
+
static int
nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
struct list_head *dispose)
@@ -4533,6 +4594,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
}
static const struct lock_manager_operations nfsd_lease_mng_ops = {
+ .lm_breaker_owns_lease = nfsd_breaker_owns_lease,
.lm_break = nfsd_break_deleg_cb,
.lm_change = nfsd_change_deleg_cb,
};
@@ -5018,8 +5080,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
- dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
- STATEID_VAL(&dp->dl_stid.sc_stateid));
+ trace_nfsd_deleg_open(&dp->dl_stid.sc_stateid);
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
nfs4_put_stid(&dp->dl_stid);
return;
@@ -5136,9 +5197,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
nfs4_open_delegation(current_fh, open, stp);
nodeleg:
status = nfs_ok;
-
- dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
- STATEID_VAL(&stp->st_stid.sc_stateid));
+ trace_nfsd_deleg_none(&stp->st_stid.sc_stateid);
out:
/* 4.1 client trying to upgrade/downgrade delegation? */
if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp &&
@@ -5192,8 +5251,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- dprintk("process_renew(%08x/%08x): starting\n",
- clid->cl_boot, clid->cl_id);
+ trace_nfsd_clid_renew(clid);
status = lookup_clientid(clid, cstate, nn, false);
if (status)
goto out;
@@ -5214,6 +5272,7 @@ nfsd4_end_grace(struct nfsd_net *nn)
if (nn->grace_ended)
return;
+ trace_nfsd_grace_complete(nn);
nn->grace_ended = true;
/*
* If the server goes down again right now, an NFSv4
@@ -5279,13 +5338,10 @@ nfs4_laundromat(struct nfsd_net *nn)
copy_stateid_t *cps_t;
int i;
- dprintk("NFSD: laundromat service - starting\n");
-
if (clients_still_reclaiming(nn)) {
new_timeo = 0;
goto out;
}
- dprintk("NFSD: end of grace period\n");
nfsd4_end_grace(nn);
INIT_LIST_HEAD(&reaplist);
@@ -5307,8 +5363,7 @@ nfs4_laundromat(struct nfsd_net *nn)
break;
}
if (mark_client_expired_locked(clp)) {
- dprintk("NFSD: client in use (clientid %08x)\n",
- clp->cl_clientid.cl_id);
+ trace_nfsd_clid_expired(&clp->cl_clientid);
continue;
}
list_add(&clp->cl_lru, &reaplist);
@@ -5316,8 +5371,7 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_unlock(&nn->client_lock);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
- dprintk("NFSD: purging unused client (clientid %08x)\n",
- clp->cl_clientid.cl_id);
+ trace_nfsd_clid_purged(&clp->cl_clientid);
list_del_init(&clp->cl_lru);
expire_client(clp);
}
@@ -5407,7 +5461,6 @@ laundromat_main(struct work_struct *laundry)
laundromat_work);
t = nfs4_laundromat(nn);
- dprintk("NFSD: laundromat_main - sleeping for %lld seconds\n", t);
queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
@@ -5948,8 +6001,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
struct nfs4_stid *s;
struct nfs4_ol_stateid *stp = NULL;
- dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
- seqid, STATEID_VAL(stateid));
+ trace_nfsd_preprocess(seqid, stateid);
*stpp = NULL;
status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
@@ -6018,9 +6070,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
oo->oo_flags |= NFS4_OO_CONFIRMED;
nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
mutex_unlock(&stp->st_mutex);
- dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
- __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
-
+ trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid);
nfsd4_client_record_create(oo->oo_owner.so_client);
status = nfs_ok;
put_stateid:
@@ -7072,7 +7122,7 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
unsigned int strhashval;
struct nfs4_client_reclaim *crp;
- dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", name.len, name.data);
+ trace_nfsd_clid_reclaim(nn, name.len, name.data);
crp = alloc_reclaim();
if (crp) {
strhashval = clientstr_hashval(name);
@@ -7122,7 +7172,7 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
unsigned int strhashval;
struct nfs4_client_reclaim *crp = NULL;
- dprintk("NFSD: nfs4_find_reclaim_client for name %.*s\n", name.len, name.data);
+ trace_nfsd_clid_find(nn, name.len, name.data);
strhashval = clientstr_hashval(name);
list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
@@ -7686,6 +7736,9 @@ nfsd_recall_delegations(struct list_head *reaplist)
list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
list_del_init(&dp->dl_recall_lru);
clp = dp->dl_stid.sc_client;
+
+ trace_nfsd_deleg_recall(&dp->dl_stid.sc_stateid);
+
/*
* We skipped all entries that had a zero dl_time before,
* so we can now reset the dl_time back to 0. If a delegation
@@ -7868,6 +7921,7 @@ nfs4_state_start_net(struct net *net)
goto skip_grace;
printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n",
nn->nfsd4_grace, net->ns.inum);
+ trace_nfsd_grace_start(nn);
queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
return 0;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 96352ab7bd81..0a0cf1fd77d3 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -20,8 +20,7 @@
#include "nfsd.h"
#include "cache.h"
-
-#define NFSDDBG_FACILITY NFSDDBG_REPCACHE
+#include "trace.h"
/*
* We use this value to determine the number of hash buckets from the max
@@ -36,6 +35,8 @@ struct nfsd_drc_bucket {
spinlock_t cache_lock;
};
+static struct kmem_cache *drc_slab;
+
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
@@ -95,7 +96,7 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
{
struct svc_cacherep *rp;
- rp = kmem_cache_alloc(nn->drc_slab, GFP_KERNEL);
+ rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
@@ -129,7 +130,7 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
atomic_dec(&nn->num_drc_entries);
nn->drc_mem_usage -= sizeof(*rp);
}
- kmem_cache_free(nn->drc_slab, rp);
+ kmem_cache_free(drc_slab, rp);
}
static void
@@ -141,6 +142,18 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
spin_unlock(&b->cache_lock);
}
+int nfsd_drc_slab_create(void)
+{
+ drc_slab = kmem_cache_create("nfsd_drc",
+ sizeof(struct svc_cacherep), 0, 0, NULL);
+ return drc_slab ? 0: -ENOMEM;
+}
+
+void nfsd_drc_slab_free(void)
+{
+ kmem_cache_destroy(drc_slab);
+}
+
int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
@@ -159,18 +172,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
if (status)
goto out_nomem;
- nn->drc_slab = kmem_cache_create("nfsd_drc",
- sizeof(struct svc_cacherep), 0, 0, NULL);
- if (!nn->drc_slab)
- goto out_shrinker;
-
nn->drc_hashtbl = kcalloc(hashsize,
sizeof(*nn->drc_hashtbl), GFP_KERNEL);
if (!nn->drc_hashtbl) {
nn->drc_hashtbl = vzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)));
if (!nn->drc_hashtbl)
- goto out_slab;
+ goto out_shrinker;
}
for (i = 0; i < hashsize; i++) {
@@ -180,8 +188,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
nn->drc_hashsize = hashsize;
return 0;
-out_slab:
- kmem_cache_destroy(nn->drc_slab);
out_shrinker:
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
out_nomem:
@@ -209,8 +215,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
nn->drc_hashtbl = NULL;
nn->drc_hashsize = 0;
- kmem_cache_destroy(nn->drc_slab);
- nn->drc_slab = NULL;
}
/*
@@ -323,8 +327,10 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
const struct svc_cacherep *rp, struct nfsd_net *nn)
{
if (key->c_key.k_xid == rp->c_key.k_xid &&
- key->c_key.k_csum != rp->c_key.k_csum)
+ key->c_key.k_csum != rp->c_key.k_csum) {
++nn->payload_misses;
+ trace_nfsd_drc_mismatch(nn, key, rp);
+ }
return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}
@@ -377,15 +383,22 @@ out:
return ret;
}
-/*
+/**
+ * nfsd_cache_lookup - Find an entry in the duplicate reply cache
+ * @rqstp: Incoming Call to find
+ *
* Try to find an entry matching the current call in the cache. When none
* is found, we try to grab the oldest expired entry off the LRU list. If
* a suitable one isn't there, then drop the cache_lock and allocate a
* new one, then search again in case one got inserted while this thread
* didn't hold the lock.
+ *
+ * Return values:
+ * %RC_DOIT: Process the request normally
+ * %RC_REPLY: Reply from cache
+ * %RC_DROPIT: Do not process the request further
*/
-int
-nfsd_cache_lookup(struct svc_rqst *rqstp)
+int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp, *found;
@@ -399,7 +412,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
nfsdstats.rcnocache++;
- return rtn;
+ goto out;
}
csum = nfsd_cache_csum(rqstp);
@@ -409,10 +422,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- return rtn;
- }
+ if (!rp)
+ goto out;
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
@@ -431,8 +442,10 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
/* go ahead and prune the cache */
prune_bucket(b, nn);
- out:
+
+out_unlock:
spin_unlock(&b->cache_lock);
+out:
return rtn;
found_entry:
@@ -442,13 +455,13 @@ found_entry:
/* Request being processed */
if (rp->c_state == RC_INPROG)
- goto out;
+ goto out_trace;
/* From the hall of fame of impractical attacks:
* Is this a user who tries to snoop on the cache? */
rtn = RC_DOIT;
if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
- goto out;
+ goto out_trace;
/* Compose RPC reply header */
switch (rp->c_type) {
@@ -460,21 +473,26 @@ found_entry:
break;
case RC_REPLBUFF:
if (!nfsd_cache_append(rqstp, &rp->c_replvec))
- goto out; /* should not happen */
+ goto out_unlock; /* should not happen */
rtn = RC_REPLY;
break;
default:
- printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- nfsd_reply_cache_free_locked(b, rp, nn);
+ WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type);
}
- goto out;
+out_trace:
+ trace_nfsd_drc_found(nn, rqstp, rtn);
+ goto out_unlock;
}
-/*
- * Update a cache entry. This is called from nfsd_dispatch when
- * the procedure has been executed and the complete reply is in
- * rqstp->rq_res.
+/**
+ * nfsd_cache_update - Update an entry in the duplicate reply cache.
+ * @rqstp: svc_rqst with a finished Reply
+ * @cachetype: which cache to update
+ * @statp: Reply's status code
+ *
+ * This is called from nfsd_dispatch when the procedure has been
+ * executed and the complete reply is in rqstp->rq_res.
*
* We're copying around data here rather than swapping buffers because
* the toplevel loop requires max-sized buffers, which would be a waste
@@ -487,8 +505,7 @@ found_entry:
* nfsd failed to encode a reply that otherwise would have been cached.
* In this case, nfsd_cache_update is called with statp == NULL.
*/
-void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3bb2db947d29..b68e96681522 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -238,7 +238,7 @@ static inline struct net *netns(struct file *file)
return file_inode(file)->i_sb->s_fs_info;
}
-/**
+/*
* write_unlock_ip - Release all locks used by a client
*
* Experimental.
@@ -277,7 +277,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
return nlmsvc_unlock_all_by_ip(sap);
}
-/**
+/*
* write_unlock_fs - Release all locks on a local file system
*
* Experimental.
@@ -327,7 +327,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
return error;
}
-/**
+/*
* write_filehandle - Get a variable-length NFS file handle by path
*
* On input, the buffer contains a '\n'-terminated C string comprised of
@@ -402,7 +402,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
return mesg - buf;
}
-/**
+/*
* write_threads - Start NFSD, or report the current number of running threads
*
* Input:
@@ -452,7 +452,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
}
-/**
+/*
* write_pool_threads - Set or report the current number of threads per pool
*
* Input:
@@ -661,7 +661,7 @@ out:
return tlen + len;
}
-/**
+/*
* write_versions - Set or report the available NFS protocol versions
*
* Input:
@@ -811,7 +811,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size,
return -EINVAL;
}
-/**
+/*
* write_ports - Pass a socket file descriptor or transport name to listen on
*
* Input:
@@ -867,7 +867,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
int nfsd_max_blksize;
-/**
+/*
* write_maxblksize - Set or report the current NFS blksize
*
* Input:
@@ -917,7 +917,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
nfsd_max_blksize);
}
-/**
+/*
* write_maxconn - Set or report the current max number of connections
*
* Input:
@@ -998,7 +998,7 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
return rv;
}
-/**
+/*
* write_leasetime - Set or report the current NFSv4 lease time
*
* Input:
@@ -1025,7 +1025,7 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
}
-/**
+/*
* write_gracetime - Set or report current NFSv4 grace period time
*
* As above, but sets the time of the NFSv4 grace period.
@@ -1069,7 +1069,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
nfs4_recoverydir());
}
-/**
+/*
* write_recoverydir - Set or report the pathname of the recovery directory
*
* Input:
@@ -1101,7 +1101,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
return rv;
}
-/**
+/*
* write_v4_end_grace - release grace period for nfsd's v4.x lock manager
*
* Input:
@@ -1533,6 +1533,9 @@ static int __init init_nfsd(void)
goto out_free_slabs;
nfsd_fault_inject_init(); /* nfsd fault injection controls */
nfsd_stat_init(); /* Statistics */
+ retval = nfsd_drc_slab_create();
+ if (retval)
+ goto out_free_stat;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
retval = create_proc_exports_entry();
if (retval)
@@ -1546,6 +1549,8 @@ out_free_all:
remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
nfsd_lockd_shutdown();
+ nfsd_drc_slab_free();
+out_free_stat:
nfsd_stat_shutdown();
nfsd_fault_inject_cleanup();
nfsd4_exit_pnfs();
@@ -1560,6 +1565,7 @@ out_unregister_pernet:
static void __exit exit_nfsd(void)
{
+ nfsd_drc_slab_free();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
nfsd_stat_shutdown();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2ab5569126b8..36cdd81b6688 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -88,6 +88,8 @@ int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_destroy(struct net *net);
+bool i_am_nfsd(void);
+
struct nfsdfs_client {
struct kref cl_ref;
void (*cl_release)(struct kref *kref);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ca9fd348548b..b603dfcdd361 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -601,6 +601,11 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = {
.svo_module = THIS_MODULE,
};
+bool i_am_nfsd(void)
+{
+ return kthread_func(current) == nfsd;
+}
+
int nfsd_create_serv(struct net *net)
{
int error;
@@ -1011,6 +1016,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
*statp = rpc_garbage_args;
return 1;
}
+ rqstp->rq_lease_breaker = NULL;
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 68d3f30ee760..3b408532a5dc 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,13 +64,6 @@ typedef struct {
refcount_t sc_count;
} copy_stateid_t;
-#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
-#define STATEID_VAL(s) \
- (s)->si_opaque.so_clid.cl_boot, \
- (s)->si_opaque.so_clid.cl_id, \
- (s)->si_opaque.so_id, \
- (s)->si_generation
-
struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 78c574251c60..1861db1bdc67 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -277,6 +277,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class,
DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \
TP_PROTO(stateid_t *stp), \
TP_ARGS(stp))
+
DEFINE_STATEID_EVENT(layoutstate_alloc);
DEFINE_STATEID_EVENT(layoutstate_unhash);
DEFINE_STATEID_EVENT(layoutstate_free);
@@ -288,6 +289,138 @@ DEFINE_STATEID_EVENT(layout_recall_done);
DEFINE_STATEID_EVENT(layout_recall_fail);
DEFINE_STATEID_EVENT(layout_recall_release);
+DEFINE_STATEID_EVENT(deleg_open);
+DEFINE_STATEID_EVENT(deleg_none);
+DEFINE_STATEID_EVENT(deleg_break);
+DEFINE_STATEID_EVENT(deleg_recall);
+
+DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
+ TP_PROTO(u32 seqid, const stateid_t *stp),
+ TP_ARGS(seqid, stp),
+ TP_STRUCT__entry(
+ __field(u32, seqid)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ __entry->seqid = seqid;
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("seqid=%u client %08x:%08x stateid %08x:%08x",
+ __entry->seqid, __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation)
+)
+
+#define DEFINE_STATESEQID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
+ TP_PROTO(u32 seqid, const stateid_t *stp), \
+ TP_ARGS(seqid, stp))
+
+DEFINE_STATESEQID_EVENT(preprocess);
+DEFINE_STATESEQID_EVENT(open_confirm);
+
+DECLARE_EVENT_CLASS(nfsd_clientid_class,
+ TP_PROTO(const clientid_t *clid),
+ TP_ARGS(clid),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clid->cl_boot;
+ __entry->cl_id = clid->cl_id;
+ ),
+ TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
+)
+
+#define DEFINE_CLIENTID_EVENT(name) \
+DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \
+ TP_PROTO(const clientid_t *clid), \
+ TP_ARGS(clid))
+
+DEFINE_CLIENTID_EVENT(expired);
+DEFINE_CLIENTID_EVENT(purged);
+DEFINE_CLIENTID_EVENT(renew);
+DEFINE_CLIENTID_EVENT(stale);
+
+DECLARE_EVENT_CLASS(nfsd_net_class,
+ TP_PROTO(const struct nfsd_net *nn),
+ TP_ARGS(nn),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ ),
+ TP_printk("boot_time=%16llx", __entry->boot_time)
+)
+
+#define DEFINE_NET_EVENT(name) \
+DEFINE_EVENT(nfsd_net_class, nfsd_##name, \
+ TP_PROTO(const struct nfsd_net *nn), \
+ TP_ARGS(nn))
+
+DEFINE_NET_EVENT(grace_start);
+DEFINE_NET_EVENT(grace_complete);
+
+DECLARE_EVENT_CLASS(nfsd_clid_class,
+ TP_PROTO(const struct nfsd_net *nn,
+ unsigned int namelen,
+ const unsigned char *namedata),
+ TP_ARGS(nn, namelen, namedata),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned int, namelen)
+ __dynamic_array(unsigned char, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->namelen = namelen;
+ memcpy(__get_dynamic_array(name), namedata, namelen);
+ ),
+ TP_printk("boot_time=%16llx nfs4_clientid=%.*s",
+ __entry->boot_time, __entry->namelen, __get_str(name))
+)
+
+#define DEFINE_CLID_EVENT(name) \
+DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \
+ TP_PROTO(const struct nfsd_net *nn, \
+ unsigned int namelen, \
+ const unsigned char *namedata), \
+ TP_ARGS(nn, namelen, namedata))
+
+DEFINE_CLID_EVENT(find);
+DEFINE_CLID_EVENT(reclaim);
+
+TRACE_EVENT(nfsd_clid_inuse_err,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __field(unsigned int, namelen)
+ __dynamic_array(unsigned char, name, clp->cl_name.len)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ __entry->namelen = clp->cl_name.len;
+ memcpy(__get_dynamic_array(name), clp->cl_name.data,
+ clp->cl_name.len);
+ ),
+ TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x",
+ __entry->namelen, __get_str(name), __entry->addr,
+ __entry->cl_boot, __entry->cl_id)
+)
+
TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
@@ -432,6 +565,218 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
__entry->nlink, __entry->mode, __entry->mask)
);
+#include "cache.h"
+
+TRACE_DEFINE_ENUM(RC_DROPIT);
+TRACE_DEFINE_ENUM(RC_REPLY);
+TRACE_DEFINE_ENUM(RC_DOIT);
+
+#define show_drc_retval(x) \
+ __print_symbolic(x, \
+ { RC_DROPIT, "DROPIT" }, \
+ { RC_REPLY, "REPLY" }, \
+ { RC_DOIT, "DOIT" })
+
+TRACE_EVENT(nfsd_drc_found,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_rqst *rqstp,
+ int result
+ ),
+ TP_ARGS(nn, rqstp, result),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned long, result)
+ __field(u32, xid)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->result = result;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x result=%s",
+ __entry->boot_time, __entry->xid,
+ show_drc_retval(__entry->result))
+
+);
+
+TRACE_EVENT(nfsd_drc_mismatch,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_cacherep *key,
+ const struct svc_cacherep *rp
+ ),
+ TP_ARGS(nn, key, rp),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(u32, xid)
+ __field(u32, cached)
+ __field(u32, ingress)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->xid = be32_to_cpu(key->c_key.k_xid);
+ __entry->cached = (__force u32)key->c_key.k_csum;
+ __entry->ingress = (__force u32)rp->c_key.k_csum;
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x cached-csum=0x%08x ingress-csum=0x%08x",
+ __entry->boot_time, __entry->xid, __entry->cached,
+ __entry->ingress)
+);
+
+TRACE_EVENT(nfsd_cb_args,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct nfs4_cb_conn *conn
+ ),
+ TP_ARGS(clp, conn),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, prog)
+ __field(u32, ident)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->prog = conn->cb_prog;
+ __entry->ident = conn->cb_ident;
+ memcpy(__entry->addr, &conn->cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u",
+ __entry->cl_boot, __entry->cl_id,
+ __entry->addr, __entry->prog, __entry->ident)
+);
+
+TRACE_EVENT(nfsd_cb_nodelegs,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ ),
+ TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
+)
+
+TRACE_DEFINE_ENUM(NFSD4_CB_UP);
+TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN);
+TRACE_DEFINE_ENUM(NFSD4_CB_DOWN);
+TRACE_DEFINE_ENUM(NFSD4_CB_FAULT);
+
+#define show_cb_state(val) \
+ __print_symbolic(val, \
+ { NFSD4_CB_UP, "UP" }, \
+ { NFSD4_CB_UNKNOWN, "UNKNOWN" }, \
+ { NFSD4_CB_DOWN, "DOWN" }, \
+ { NFSD4_CB_FAULT, "FAULT"})
+
+DECLARE_EVENT_CLASS(nfsd_cb_class,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->state = clp->cl_cb_state;
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x state=%s",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ show_cb_state(__entry->state))
+);
+
+#define DEFINE_NFSD_CB_EVENT(name) \
+DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \
+ TP_PROTO(const struct nfs4_client *clp), \
+ TP_ARGS(clp))
+
+DEFINE_NFSD_CB_EVENT(setup);
+DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(shutdown);
+
+TRACE_EVENT(nfsd_cb_setup_err,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ long error
+ ),
+ TP_ARGS(clp, error),
+ TP_STRUCT__entry(
+ __field(long, error)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x error=%ld",
+ __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+);
+
+TRACE_EVENT(nfsd_cb_work,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const char *procedure
+ ),
+ TP_ARGS(clp, procedure),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __string(procedure, procedure)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __assign_str(procedure, procedure)
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x procedure=%s",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_str(procedure))
+);
+
+TRACE_EVENT(nfsd_cb_done,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ int status
+ ),
+ TP_ARGS(clp, status),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(int, status)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->status = status;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x status=%d",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __entry->status)
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 445eef41bfaf..91b58c897f92 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2780,6 +2780,8 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (!nilfs->ns_writer)
return -ENOMEM;
+ inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
+
err = nilfs_segctor_start_thread(nilfs->ns_writer);
if (err) {
kfree(nilfs->ns_writer);
diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig
index 5a63303298e6..c7857e36adbb 100644
--- a/fs/nls/Kconfig
+++ b/fs/nls/Kconfig
@@ -5,7 +5,7 @@
menuconfig NLS
tristate "Native language support"
- ---help---
+ help
The base Native Language Support. A number of filesystems
depend on it (e.g. FAT, JOLIET, NT, BEOS filesystems), as well
as the ability of some filesystems to use native languages
@@ -21,7 +21,7 @@ if NLS
config NLS_DEFAULT
string "Default NLS Option"
default "iso8859-1"
- ---help---
+ help
The default NLS used when mounting file system. Note, that this is
the NLS used by your console, not the NLS used by a specific file
system (if different) to store data (filenames) on a disk.
@@ -76,7 +76,7 @@ config NLS_CODEPAGE_775
config NLS_CODEPAGE_850
tristate "Codepage 850 (Europe)"
- ---help---
+ help
The Microsoft FAT file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called DOS codepages. You need to include the appropriate
@@ -92,7 +92,7 @@ config NLS_CODEPAGE_850
config NLS_CODEPAGE_852
tristate "Codepage 852 (Central/Eastern Europe)"
- ---help---
+ help
The Microsoft FAT file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called DOS codepages. You need to include the appropriate
@@ -421,7 +421,7 @@ config NLS_ISO8859_14
config NLS_ISO8859_15
tristate "NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)"
- ---help---
+ help
If you want to display filenames with native language characters
from the Microsoft FAT file system family or from JOLIET CD-ROMs
correctly on the screen, you need to include the appropriate
@@ -455,7 +455,7 @@ config NLS_KOI8_U
config NLS_MAC_ROMAN
tristate "Codepage macroman"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -470,7 +470,7 @@ config NLS_MAC_ROMAN
config NLS_MAC_CELTIC
tristate "Codepage macceltic"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -484,7 +484,7 @@ config NLS_MAC_CELTIC
config NLS_MAC_CENTEURO
tristate "Codepage maccenteuro"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -498,7 +498,7 @@ config NLS_MAC_CENTEURO
config NLS_MAC_CROATIAN
tristate "Codepage maccroatian"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -512,7 +512,7 @@ config NLS_MAC_CROATIAN
config NLS_MAC_CYRILLIC
tristate "Codepage maccyrillic"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -526,7 +526,7 @@ config NLS_MAC_CYRILLIC
config NLS_MAC_GAELIC
tristate "Codepage macgaelic"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -540,7 +540,7 @@ config NLS_MAC_GAELIC
config NLS_MAC_GREEK
tristate "Codepage macgreek"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -554,7 +554,7 @@ config NLS_MAC_GREEK
config NLS_MAC_ICELAND
tristate "Codepage maciceland"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -568,7 +568,7 @@ config NLS_MAC_ICELAND
config NLS_MAC_INUIT
tristate "Codepage macinuit"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -582,7 +582,7 @@ config NLS_MAC_INUIT
config NLS_MAC_ROMANIAN
tristate "Codepage macromanian"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
@@ -596,7 +596,7 @@ config NLS_MAC_ROMANIAN
config NLS_MAC_TURKISH
tristate "Codepage macturkish"
- ---help---
+ help
The Apple HFS file system family can deal with filenames in
native language character sets. These character sets are stored in
so-called MAC codepages. You need to include the appropriate
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 8b9103f126ad..a511f9d8677b 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -4,7 +4,7 @@ config FANOTIFY
select FSNOTIFY
select EXPORTFS
default n
- ---help---
+ help
Say Y here to enable fanotify support. fanotify is a file access
notification system which differs from inotify in that it sends
an open file descriptor to the userspace listener along with
@@ -17,7 +17,7 @@ config FANOTIFY_ACCESS_PERMISSIONS
depends on FANOTIFY
depends on SECURITY
default n
- ---help---
+ help
Say Y here is you want fanotify listeners to be able to make permissions
decisions concerning filesystem events. This is used by some fanotify
listeners which need to scan files before allowing the system access to
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 7715fadd5fff..1cc8be25df7e 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -3,7 +3,7 @@ config INOTIFY_USER
bool "Inotify support for userspace"
select FSNOTIFY
default y
- ---help---
+ help
Say Y here to enable inotify support for userspace, including the
associated system calls. Inotify allows monitoring of both files and
directories via a single open fd. Events are read from the file
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 1177c33df895..aca16624b370 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config OCFS2_FS
tristate "OCFS2 file system support"
- depends on NET && SYSFS && CONFIGFS_FS
+ depends on INET && SYSFS && CONFIGFS_FS
select JBD2
select CRC32
select QUOTA
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3a44e461828a..25cabbfe87fc 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -62,7 +62,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
last_index = (size - 1) >> PAGE_SHIFT;
/*
- * There are cases that lead to the page no longer bebongs to the
+ * There are cases that lead to the page no longer belonging to the
* mapping.
* 1) pagecache truncates locally due to memory pressure.
* 2) pagecache truncates when another is taking EX lock against
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9709cf22cab3..79dd052c7dbf 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -47,7 +47,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
- int uninitialized_var(error);
+ int error = 0;
size_t slen;
if (!(old->d_inode->i_opflags & IOP_XATTR) ||
@@ -584,9 +584,10 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
.link = c->link
};
- err = ovl_lock_rename_workdir(c->workdir, c->destdir);
- if (err)
- return err;
+ /* workdir and destdir could be the same when copying up to indexdir */
+ err = -EIO;
+ if (lock_rename(c->workdir, c->destdir) != NULL)
+ goto unlock;
err = ovl_prep_cu_creds(c->dentry, &cc);
if (err)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 279009dee366..1bba4813f9cb 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -62,35 +62,59 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir)
}
/* caller holds i_mutex on workdir */
-static struct dentry *ovl_whiteout(struct dentry *workdir)
+static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
{
int err;
struct dentry *whiteout;
+ struct dentry *workdir = ofs->workdir;
struct inode *wdir = workdir->d_inode;
- whiteout = ovl_lookup_temp(workdir);
- if (IS_ERR(whiteout))
- return whiteout;
+ if (!ofs->whiteout) {
+ whiteout = ovl_lookup_temp(workdir);
+ if (IS_ERR(whiteout))
+ goto out;
- err = ovl_do_whiteout(wdir, whiteout);
- if (err) {
- dput(whiteout);
- whiteout = ERR_PTR(err);
+ err = ovl_do_whiteout(wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ goto out;
+ }
+ ofs->whiteout = whiteout;
}
+ if (ofs->share_whiteout) {
+ whiteout = ovl_lookup_temp(workdir);
+ if (IS_ERR(whiteout))
+ goto out;
+
+ err = ovl_do_link(ofs->whiteout, wdir, whiteout);
+ if (!err)
+ goto out;
+
+ if (err != -EMLINK) {
+ pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n",
+ ofs->whiteout->d_inode->i_nlink, err);
+ ofs->share_whiteout = false;
+ }
+ dput(whiteout);
+ }
+ whiteout = ofs->whiteout;
+ ofs->whiteout = NULL;
+out:
return whiteout;
}
/* Caller must hold i_mutex on both workdir and dir */
-int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
struct dentry *dentry)
{
- struct inode *wdir = workdir->d_inode;
+ struct inode *wdir = ofs->workdir->d_inode;
struct dentry *whiteout;
int err;
int flags = 0;
- whiteout = ovl_whiteout(workdir);
+ whiteout = ovl_whiteout(ofs);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
return err;
@@ -262,6 +286,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
inode = ovl_get_inode(dentry->d_sb, &oip);
if (IS_ERR(inode))
return PTR_ERR(inode);
+ if (inode == oip.newinode)
+ ovl_set_flag(OVL_UPPERDATA, inode);
} else {
WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
dput(newdentry);
@@ -715,6 +741,7 @@ static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper)
static int ovl_remove_and_whiteout(struct dentry *dentry,
struct list_head *list)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct dentry *upper;
@@ -748,7 +775,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
goto out_dput_upper;
}
- err = ovl_cleanup_and_whiteout(workdir, d_inode(upperdir), upper);
+ err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper);
if (err)
goto out_d_drop;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index ed5c1078919c..8f4286450f92 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -204,7 +204,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
* ovl_connect_layer() will try to make origin's layer "connected" by
* copying up a "connectable" ancestor.
*/
- if (d_is_dir(dentry) && ofs->upper_mnt)
+ if (d_is_dir(dentry) && ovl_upper_mnt(ofs))
return ovl_connect_layer(dentry);
/* Lower file handle for indexed and non-upper dir/non-dir */
@@ -231,12 +231,9 @@ static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen)
if (IS_ERR(fh))
return PTR_ERR(fh);
- err = -EOVERFLOW;
len = OVL_FH_LEN(fh);
- if (len > buflen)
- goto fail;
-
- memcpy(fid, fh, len);
+ if (len <= buflen)
+ memcpy(fid, fh, len);
err = len;
out:
@@ -244,9 +241,8 @@ out:
return err;
fail:
- pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
- dentry, err, buflen, fh ? (int)fh->fb.len : 0,
- fh ? fh->fb.type : 0);
+ pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n",
+ dentry, err);
goto out;
}
@@ -254,7 +250,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
struct inode *parent)
{
struct dentry *dentry;
- int bytes = *max_len << 2;
+ int bytes, buflen = *max_len << 2;
/* TODO: encode connectable file handles */
if (parent)
@@ -264,12 +260,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
if (WARN_ON(!dentry))
return FILEID_INVALID;
- bytes = ovl_dentry_to_fid(dentry, fid, bytes);
+ bytes = ovl_dentry_to_fid(dentry, fid, buflen);
dput(dentry);
if (bytes <= 0)
return FILEID_INVALID;
*max_len = bytes >> 2;
+ if (bytes > buflen)
+ return FILEID_INVALID;
return OVL_FILEID_V1;
}
@@ -679,10 +677,10 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
struct dentry *dentry;
struct dentry *upper;
- if (!ofs->upper_mnt)
+ if (!ovl_upper_mnt(ofs))
return ERR_PTR(-EACCES);
- upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
+ upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true);
if (IS_ERR_OR_NULL(upper))
return upper;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 87c362f65448..01820e654a21 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -10,6 +10,7 @@
#include <linux/uio.h>
#include <linux/uaccess.h>
#include <linux/splice.h>
+#include <linux/security.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include "overlayfs.h"
@@ -39,10 +40,22 @@ static struct file *ovl_open_realfile(const struct file *file,
struct file *realfile;
const struct cred *old_cred;
int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
+ int acc_mode = ACC_MODE(flags);
+ int err;
+
+ if (flags & O_APPEND)
+ acc_mode |= MAY_APPEND;
old_cred = ovl_override_creds(inode->i_sb);
- realfile = open_with_fake_path(&file->f_path, flags, realinode,
- current_cred());
+ err = inode_permission(realinode, MAY_OPEN | acc_mode);
+ if (err) {
+ realfile = ERR_PTR(err);
+ } else if (!inode_owner_or_capable(realinode)) {
+ realfile = ERR_PTR(-EPERM);
+ } else {
+ realfile = open_with_fake_path(&file->f_path, flags, realinode,
+ current_cred());
+ }
revert_creds(old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
@@ -219,9 +232,8 @@ static void ovl_file_accessed(struct file *file)
touch_atime(&file->f_path);
}
-static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
+static rwf_t ovl_iocb_to_rwf(int ifl)
{
- int ifl = iocb->ki_flags;
rwf_t flags = 0;
if (ifl & IOCB_NOWAIT)
@@ -283,7 +295,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
+ ovl_iocb_to_rwf(iocb->ki_flags));
} else {
struct ovl_aio_req *aio_req;
@@ -336,7 +348,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);
ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
+ ovl_iocb_to_rwf(iocb->ki_flags));
file_end_write(real.file);
/* Update size */
ovl_copyattr(ovl_inode_real(inode), inode);
@@ -520,7 +532,9 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd,
return ret;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_ioctl(real.file, cmd, arg);
+ ret = security_file_ioctl(real.file, cmd, arg);
+ if (!ret)
+ ret = vfs_ioctl(real.file, cmd, arg);
revert_creds(old_cred);
fdput(real);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7af76b9004eb..8be6cd264f66 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -457,7 +457,7 @@ int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
if (flags & S_ATIME) {
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
struct path upperpath = {
- .mnt = ofs->upper_mnt,
+ .mnt = ovl_upper_mnt(ofs),
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
};
@@ -905,7 +905,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
* Does overlay inode need to be hashed by lower inode?
*/
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
- struct dentry *lower, struct dentry *index)
+ struct dentry *lower, bool index)
{
struct ovl_fs *ofs = sb->s_fs_info;
@@ -918,7 +918,7 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
return true;
/* Yes, if won't be copied up */
- if (!ofs->upper_mnt)
+ if (!ovl_upper_mnt(ofs))
return true;
/* No, if lower hardlink is or will be broken on copy up */
@@ -954,7 +954,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
int fsid = bylower ? lowerpath->layer->fsid : 0;
- bool is_dir, metacopy = false;
+ bool is_dir;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -1015,15 +1015,6 @@ struct inode *ovl_get_inode(struct super_block *sb,
if (oip->index)
ovl_set_flag(OVL_INDEX, inode);
- if (upperdentry) {
- err = ovl_check_metacopy_xattr(upperdentry);
- if (err < 0)
- goto out_err;
- metacopy = err;
- if (!metacopy)
- ovl_set_flag(OVL_UPPERDATA, inode);
- }
-
OVL_I(inode)->redirect = oip->redirect;
if (bylower)
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 0db23baf98e7..3566282a9199 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -191,16 +191,36 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE);
}
+static struct dentry *ovl_lookup_positive_unlocked(const char *name,
+ struct dentry *base, int len,
+ bool drop_negative)
+{
+ struct dentry *ret = lookup_one_len_unlocked(name, base, len);
+
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ if (drop_negative && ret->d_lockref.count == 1) {
+ spin_lock(&ret->d_lock);
+ /* Recheck condition under lock */
+ if (d_is_negative(ret) && ret->d_lockref.count == 1)
+ __d_drop(ret);
+ spin_unlock(&ret->d_lock);
+ }
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+
static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
const char *name, unsigned int namelen,
size_t prelen, const char *post,
- struct dentry **ret)
+ struct dentry **ret, bool drop_negative)
{
struct dentry *this;
int err;
bool last_element = !post[0];
- this = lookup_positive_unlocked(name, base, namelen);
+ this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative);
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -276,7 +296,7 @@ out_err:
}
static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
- struct dentry **ret)
+ struct dentry **ret, bool drop_negative)
{
/* Counting down from the end, since the prefix can change */
size_t rem = d->name.len - 1;
@@ -285,7 +305,7 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
if (d->name.name[0] != '/')
return ovl_lookup_single(base, d, d->name.name, d->name.len,
- 0, "", ret);
+ 0, "", ret, drop_negative);
while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
const char *s = d->name.name + d->name.len - rem;
@@ -298,7 +318,8 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
return -EIO;
err = ovl_lookup_single(base, d, s, thislen,
- d->name.len - rem, next, &base);
+ d->name.len - rem, next, &base,
+ drop_negative);
dput(dentry);
if (err)
return err;
@@ -468,7 +489,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
+ upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -484,12 +505,6 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
return upper;
}
-/* Is this a leftover from create/whiteout of directory index entry? */
-static bool ovl_is_temp_index(struct dentry *index)
-{
- return index->d_name.name[0] == '#';
-}
-
/*
* Verify that an index entry name matches the origin file handle stored in
* OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path.
@@ -507,11 +522,6 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
if (!d_inode(index))
return 0;
- /* Cleanup leftover from index create/cleanup attempt */
- err = -ESTALE;
- if (ovl_is_temp_index(index))
- goto fail;
-
err = -EINVAL;
if (index->d_name.len < sizeof(struct ovl_fb)*2)
goto fail;
@@ -823,7 +833,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *this;
unsigned int i;
int err;
- bool metacopy = false;
+ bool uppermetacopy = false;
struct ovl_lookup_data d = {
.sb = dentry->d_sb,
.name = dentry->d_name,
@@ -841,7 +851,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- err = ovl_lookup_layer(upperdir, &d, &upperdentry);
+ err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -869,7 +879,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_put_upper;
if (d.metacopy)
- metacopy = true;
+ uppermetacopy = true;
}
if (d.redirect) {
@@ -899,13 +909,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else
d.last = lower.layer->idx == roe->numlower;
- err = ovl_lookup_layer(lower.dentry, &d, &this);
+ err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
if (!this)
continue;
+ if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
+ err = -EPERM;
+ pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
+ goto out_put;
+ }
+
/*
* If no origin fh is stored in upper of a merge dir, store fh
* of lower dir and set upper parent "impure".
@@ -940,21 +956,21 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
origin = this;
}
- if (d.metacopy)
- metacopy = true;
- /*
- * Do not store intermediate metacopy dentries in chain,
- * except top most lower metacopy dentry
- */
if (d.metacopy && ctr) {
+ /*
+ * Do not store intermediate metacopy dentries in
+ * lower chain, except top most lower metacopy dentry.
+ * Continue the loop so that if there is an absolute
+ * redirect on this dentry, poe can be reset to roe.
+ */
dput(this);
- continue;
+ this = NULL;
+ } else {
+ stack[ctr].dentry = this;
+ stack[ctr].layer = lower.layer;
+ ctr++;
}
- stack[ctr].dentry = this;
- stack[ctr].layer = lower.layer;
- ctr++;
-
/*
* Following redirects can have security consequences: it's like
* a symlink into the lower layer without the permission checks.
@@ -982,22 +998,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
}
}
- if (metacopy) {
- /*
- * Found a metacopy dentry but did not find corresponding
- * data dentry
- */
- if (d.metacopy) {
- err = -EIO;
- goto out_put;
- }
-
- err = -EPERM;
- if (!ofs->config.metacopy) {
- pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n",
- dentry);
- goto out_put;
- }
+ /*
+ * For regular non-metacopy upper dentries, there is no lower
+ * path based lookup, hence ctr will be zero. If a dentry is found
+ * using ORIGIN xattr on upper, install it in stack.
+ *
+ * For metacopy dentry, path based lookup will find lower dentries.
+ * Just make sure a corresponding data dentry has been found.
+ */
+ if (d.metacopy || (uppermetacopy && !ctr)) {
+ err = -EIO;
+ goto out_put;
} else if (!d.is_dir && upperdentry && !ctr && origin_path) {
if (WARN_ON(stack != NULL)) {
err = -EIO;
@@ -1005,25 +1016,30 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
}
stack = origin_path;
ctr = 1;
+ origin = origin_path->dentry;
origin_path = NULL;
}
/*
- * Lookup index by lower inode and verify it matches upper inode.
- * We only trust dir index if we verified that lower dir matches
- * origin, otherwise dir index entries may be inconsistent and we
- * ignore them.
+ * Always lookup index if there is no-upperdentry.
+ *
+ * For the case of upperdentry, we have set origin by now if it
+ * needed to be set. There are basically three cases.
+ *
+ * For directories, lookup index by lower inode and verify it matches
+ * upper inode. We only trust dir index if we verified that lower dir
+ * matches origin, otherwise dir index entries may be inconsistent
+ * and we ignore them.
+ *
+ * For regular upper, we already set origin if upper had ORIGIN
+ * xattr. There is no verification though as there is no path
+ * based dentry lookup in lower in this case.
*
- * For non-dir upper metacopy dentry, we already set "origin" if we
- * verified that lower matched upper origin. If upper origin was
- * not present (because lower layer did not support fh encode/decode),
- * or indexing is not enabled, do not set "origin" and skip looking up
- * index. This case should be handled in same way as a non-dir upper
- * without ORIGIN is handled.
+ * For metacopy upper, we set a verified origin already if index
+ * is enabled and if upper had an ORIGIN xattr.
*
- * Always lookup index of non-dir non-metacopy and non-upper.
*/
- if (ctr && (!upperdentry || (!d.is_dir && !metacopy)))
+ if (!upperdentry && ctr)
origin = stack[0].dentry;
if (origin && ovl_indexdir(dentry->d_sb) &&
@@ -1074,6 +1090,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_free_oe;
+ if (upperdentry && !uppermetacopy)
+ ovl_set_flag(OVL_UPPERDATA, inode);
}
ovl_dentry_update_reval(dentry, upperdentry,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e6f3670146ed..b725c7f15ff4 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -355,6 +355,9 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len);
static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
{
+ if (fh_len < sizeof(struct ovl_fh))
+ return -EINVAL;
+
return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET);
}
@@ -394,8 +397,8 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(struct path *realpath);
-void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
- struct dentry *dentry, int level);
+int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+ struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
/* inode.c */
@@ -421,7 +424,7 @@ struct ovl_inode_params {
struct inode *newinode;
struct dentry *upperdentry;
struct ovl_path *lowerpath;
- struct dentry *index;
+ bool index;
unsigned int numlower;
char *redirect;
struct dentry *lowerdata;
@@ -455,7 +458,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
-int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
struct dentry *dentry);
struct ovl_cattr {
dev_t rdev;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 5762d802fe01..b429c80879ee 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -46,7 +46,6 @@ struct ovl_path {
/* private information held for overlayfs's superblock */
struct ovl_fs {
- struct vfsmount *upper_mnt;
unsigned int numlayer;
/* Number of unique fs among layers including upper fs */
unsigned int numfs;
@@ -68,8 +67,8 @@ struct ovl_fs {
/* Did we take the inuse lock? */
bool upperdir_locked;
bool workdir_locked;
+ bool share_whiteout;
/* Traps in ovl inode cache */
- struct inode *upperdir_trap;
struct inode *workbasedir_trap;
struct inode *workdir_trap;
struct inode *indexdir_trap;
@@ -77,8 +76,15 @@ struct ovl_fs {
int xino_mode;
/* For allocation of non-persistent inode numbers */
atomic_long_t last_ino;
+ /* Whiteout dentry cache */
+ struct dentry *whiteout;
};
+static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
+{
+ return ofs->layers[0].mnt;
+}
+
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
{
return (struct ovl_fs *)sb->s_fs_info;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e452ff7d583d..6918b98faeb6 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -297,7 +297,7 @@ static inline int ovl_dir_read(struct path *realpath,
struct file *realfile;
int err;
- realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+ realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
@@ -743,8 +743,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
+ const struct cred *old_cred;
int err;
+ old_cred = ovl_override_creds(dentry->d_sb);
if (!ctx->pos)
ovl_dir_reset(file);
@@ -758,17 +760,20 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
(ovl_same_fs(dentry->d_sb) &&
(ovl_is_impure_dir(file) ||
OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
- return ovl_iterate_real(file, ctx);
+ err = ovl_iterate_real(file, ctx);
+ } else {
+ err = iterate_dir(od->realfile, ctx);
}
- return iterate_dir(od->realfile, ctx);
+ goto out;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
+ err = PTR_ERR(cache);
if (IS_ERR(cache))
- return PTR_ERR(cache);
+ goto out;
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
@@ -780,7 +785,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
if (!p->ino) {
err = ovl_cache_update_ino(&file->f_path, p);
if (err)
- return err;
+ goto out;
}
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
@@ -788,7 +793,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
od->cursor = p->l_node.next;
ctx->pos++;
}
- return 0;
+ err = 0;
+out:
+ revert_creds(old_cred);
+ return err;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
@@ -831,6 +839,19 @@ out_unlock:
return res;
}
+static struct file *ovl_dir_open_realfile(struct file *file,
+ struct path *realpath)
+{
+ struct file *res;
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
+ revert_creds(old_cred);
+
+ return res;
+}
+
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
@@ -853,7 +874,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
- realfile = ovl_path_open(&upperpath, O_RDONLY);
+ realfile = ovl_dir_open_realfile(file, &upperpath);
inode_lock(inode);
if (!od->upperfile) {
@@ -904,7 +925,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
- realfile = ovl_path_open(&realpath, file->f_flags);
+ realfile = ovl_dir_open_realfile(file, &realpath);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
@@ -1071,14 +1092,13 @@ out:
ovl_cache_free(&list);
}
-void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
struct dentry *dentry, int level)
{
int err;
if (!d_is_dir(dentry) || level > 1) {
- ovl_cleanup(dir, dentry);
- return;
+ return ovl_cleanup(dir, dentry);
}
err = ovl_do_rmdir(dir, dentry);
@@ -1088,8 +1108,10 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
inode_unlock(dir);
ovl_workdir_cleanup_recurse(&path, level + 1);
inode_lock_nested(dir, I_MUTEX_PARENT);
- ovl_cleanup(dir, dentry);
+ err = ovl_cleanup(dir, dentry);
}
+
+ return err;
}
int ovl_indexdir_cleanup(struct ovl_fs *ofs)
@@ -1098,7 +1120,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct dentry *indexdir = ofs->indexdir;
struct dentry *index = NULL;
struct inode *dir = indexdir->d_inode;
- struct path path = { .mnt = ofs->upper_mnt, .dentry = indexdir };
+ struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
@@ -1128,6 +1150,13 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
index = NULL;
break;
}
+ /* Cleanup leftover from index create/cleanup attempt */
+ if (index->d_name.name[0] == '#') {
+ err = ovl_workdir_cleanup(dir, path.mnt, index, 1);
+ if (err)
+ break;
+ goto next;
+ }
err = ovl_verify_index(ofs, index);
if (!err) {
goto next;
@@ -1146,7 +1175,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
* Whiteout orphan index to block future open by
* handle after overlay nlink dropped to zero.
*/
- err = ovl_cleanup_and_whiteout(indexdir, dir, index);
+ err = ovl_cleanup_and_whiteout(ofs, dir, index);
} else {
/* Cleanup orphan index entries */
err = ovl_cleanup(dir, index);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 732ad5495c92..91476bc422f9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -211,24 +211,28 @@ static void ovl_destroy_inode(struct inode *inode)
static void ovl_free_fs(struct ovl_fs *ofs)
{
+ struct vfsmount **mounts;
unsigned i;
iput(ofs->workbasedir_trap);
iput(ofs->indexdir_trap);
iput(ofs->workdir_trap);
- iput(ofs->upperdir_trap);
+ dput(ofs->whiteout);
dput(ofs->indexdir);
dput(ofs->workdir);
if (ofs->workdir_locked)
ovl_inuse_unlock(ofs->workbasedir);
dput(ofs->workbasedir);
if (ofs->upperdir_locked)
- ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
- mntput(ofs->upper_mnt);
- for (i = 1; i < ofs->numlayer; i++) {
+ ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
+
+ /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
+ mounts = (struct vfsmount **) ofs->layers;
+ for (i = 0; i < ofs->numlayer; i++) {
iput(ofs->layers[i].trap);
- mntput(ofs->layers[i].mnt);
+ mounts[i] = ofs->layers[i].mnt;
}
+ kern_unmount_array(mounts, ofs->numlayer);
kfree(ofs->layers);
for (i = 0; i < ofs->numfs; i++)
free_anon_bdev(ofs->fs[i].pseudo_dev);
@@ -257,12 +261,12 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
struct super_block *upper_sb;
int ret;
- if (!ofs->upper_mnt)
+ if (!ovl_upper_mnt(ofs))
return 0;
/*
- * If this is a sync(2) call or an emergency sync, all the super blocks
- * will be iterated, including upper_sb, so no need to do anything.
+ * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
+ * All the super blocks will be iterated, including upper_sb.
*
* If this is a syncfs(2) call, then we do need to call
* sync_filesystem() on upper_sb, but enough if we do it when being
@@ -271,7 +275,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
if (!wait)
return 0;
- upper_sb = ofs->upper_mnt->mnt_sb;
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
down_read(&upper_sb->s_umount);
ret = sync_filesystem(upper_sb);
@@ -309,7 +313,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
/* Will this overlay be forced to mount/remount ro? */
static bool ovl_force_readonly(struct ovl_fs *ofs)
{
- return (!ofs->upper_mnt || !ofs->workdir);
+ return (!ovl_upper_mnt(ofs) || !ofs->workdir);
}
static const char *ovl_redirect_mode_def(void)
@@ -364,11 +368,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
static int ovl_remount(struct super_block *sb, int *flags, char *data)
{
struct ovl_fs *ofs = sb->s_fs_info;
+ struct super_block *upper_sb;
+ int ret = 0;
if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs))
return -EROFS;
- return 0;
+ if (*flags & SB_RDONLY && !sb_rdonly(sb)) {
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ down_read(&upper_sb->s_umount);
+ ret = sync_filesystem(upper_sb);
+ up_read(&upper_sb->s_umount);
+ }
+
+ return ret;
}
static const struct super_operations ovl_super_operations = {
@@ -470,6 +483,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
char *p;
int err;
bool metacopy_opt = false, redirect_opt = false;
+ bool nfs_export_opt = false, index_opt = false;
config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL);
if (!config->redirect_mode)
@@ -519,18 +533,22 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
case OPT_INDEX_ON:
config->index = true;
+ index_opt = true;
break;
case OPT_INDEX_OFF:
config->index = false;
+ index_opt = true;
break;
case OPT_NFS_EXPORT_ON:
config->nfs_export = true;
+ nfs_export_opt = true;
break;
case OPT_NFS_EXPORT_OFF:
config->nfs_export = false;
+ nfs_export_opt = true;
break;
case OPT_XINO_ON:
@@ -552,6 +570,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
case OPT_METACOPY_OFF:
config->metacopy = false;
+ metacopy_opt = true;
break;
default:
@@ -601,6 +620,48 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
}
}
+ /* Resolve nfs_export -> index dependency */
+ if (config->nfs_export && !config->index) {
+ if (nfs_export_opt && index_opt) {
+ pr_err("conflicting options: nfs_export=on,index=off\n");
+ return -EINVAL;
+ }
+ if (index_opt) {
+ /*
+ * There was an explicit index=off that resulted
+ * in this conflict.
+ */
+ pr_info("disabling nfs_export due to index=off\n");
+ config->nfs_export = false;
+ } else {
+ /* Automatically enable index otherwise. */
+ config->index = true;
+ }
+ }
+
+ /* Resolve nfs_export -> !metacopy dependency */
+ if (config->nfs_export && config->metacopy) {
+ if (nfs_export_opt && metacopy_opt) {
+ pr_err("conflicting options: nfs_export=on,metacopy=on\n");
+ return -EINVAL;
+ }
+ if (metacopy_opt) {
+ /*
+ * There was an explicit metacopy=on that resulted
+ * in this conflict.
+ */
+ pr_info("disabling nfs_export due to metacopy=on\n");
+ config->nfs_export = false;
+ } else {
+ /*
+ * There was an explicit nfs_export=on that resulted
+ * in this conflict.
+ */
+ pr_info("disabling metacopy due to nfs_export=on\n");
+ config->metacopy = false;
+ }
+ }
+
return 0;
}
@@ -611,15 +672,12 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
const char *name, bool persist)
{
struct inode *dir = ofs->workbasedir->d_inode;
- struct vfsmount *mnt = ofs->upper_mnt;
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *work;
int err;
bool retried = false;
- bool locked = false;
inode_lock_nested(dir, I_MUTEX_PARENT);
- locked = true;
-
retry:
work = lookup_one_len(name, ofs->workbasedir, strlen(name));
@@ -680,9 +738,7 @@ retry:
goto out_err;
}
out_unlock:
- if (locked)
- inode_unlock(dir);
-
+ inode_unlock(dir);
return work;
out_dput:
@@ -779,11 +835,11 @@ static int ovl_lower_dir(const char *name, struct path *path,
err = ovl_mount_dir_noesc(name, path);
if (err)
- goto out;
+ return err;
err = ovl_check_namelen(path, ofs, name);
if (err)
- goto out_put;
+ return err;
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
@@ -805,11 +861,6 @@ static int ovl_lower_dir(const char *name, struct path *path,
ofs->xino_mode = -1;
return 0;
-
-out_put:
- path_put_init(path);
-out:
- return err;
}
/* Workdir should not be subdir of upperdir and vice versa */
@@ -1016,7 +1067,7 @@ static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
}
static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
- struct path *upperpath)
+ struct ovl_layer *upper_layer, struct path *upperpath)
{
struct vfsmount *upper_mnt;
int err;
@@ -1036,7 +1087,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
if (err)
goto out;
- err = ovl_setup_trap(sb, upperpath->dentry, &ofs->upperdir_trap,
+ err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap,
"upperdir");
if (err)
goto out;
@@ -1050,9 +1101,23 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
/* Don't inherit atime flags */
upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
- ofs->upper_mnt = upper_mnt;
+ upper_layer->mnt = upper_mnt;
+ upper_layer->idx = 0;
+ upper_layer->fsid = 0;
- if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
+ /*
+ * Inherit SB_NOSEC flag from upperdir.
+ *
+ * This optimization changes behavior when a security related attribute
+ * (suid/sgid/security.*) is changed on an underlying layer. This is
+ * okay because we don't yet have guarantees in that case, but it will
+ * need careful treatment once we want to honour changes to underlying
+ * filesystems.
+ */
+ if (upper_mnt->mnt_sb->s_flags & SB_NOSEC)
+ sb->s_flags |= SB_NOSEC;
+
+ if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) {
ofs->upperdir_locked = true;
} else {
err = ovl_report_in_use(ofs, "upperdir");
@@ -1128,7 +1193,7 @@ out_unlock:
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
struct path *workpath)
{
- struct vfsmount *mnt = ofs->upper_mnt;
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *temp;
bool rename_whiteout;
bool d_type;
@@ -1272,7 +1337,7 @@ out:
static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
struct ovl_entry *oe, struct path *upperpath)
{
- struct vfsmount *mnt = ofs->upper_mnt;
+ struct vfsmount *mnt = ovl_upper_mnt(ofs);
int err;
err = mnt_want_write(mnt);
@@ -1328,7 +1393,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
{
unsigned int i;
- if (!ofs->config.nfs_export && !ofs->upper_mnt)
+ if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
return true;
for (i = 0; i < ofs->numfs; i++) {
@@ -1388,18 +1453,13 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
}
static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
- struct path *stack, unsigned int numlower)
+ struct path *stack, unsigned int numlower,
+ struct ovl_layer *layers)
{
int err;
unsigned int i;
- struct ovl_layer *layers;
err = -ENOMEM;
- layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
- if (!layers)
- goto out;
- ofs->layers = layers;
-
ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL);
if (ofs->fs == NULL)
goto out;
@@ -1407,11 +1467,6 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
/* idx/fsid 0 are reserved for upper fs even with lower only overlay */
ofs->numfs++;
- layers[0].mnt = ofs->upper_mnt;
- layers[0].idx = 0;
- layers[0].fsid = 0;
- ofs->numlayer = 1;
-
/*
* All lower layers that share the same fs as upper layer, use the same
* pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower
@@ -1424,8 +1479,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
goto out;
}
- if (ofs->upper_mnt) {
- ofs->fs[0].sb = ofs->upper_mnt->mnt_sb;
+ if (ovl_upper_mnt(ofs)) {
+ ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
ofs->fs[0].is_lower = false;
}
@@ -1480,7 +1535,7 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
* inode number or a non persistent inode number allocated from a
* dedicated range.
*/
- if (ofs->numfs - !ofs->upper_mnt == 1) {
+ if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) {
if (ofs->config.xino == OVL_XINO_ON)
pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
ofs->xino_mode = 0;
@@ -1509,44 +1564,30 @@ out:
}
static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
- struct ovl_fs *ofs)
+ const char *lower, unsigned int numlower,
+ struct ovl_fs *ofs, struct ovl_layer *layers)
{
int err;
- char *lowertmp, *lower;
struct path *stack = NULL;
- unsigned int stacklen, numlower = 0, i;
+ unsigned int i;
struct ovl_entry *oe;
- err = -ENOMEM;
- lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
- if (!lowertmp)
- goto out_err;
-
- err = -EINVAL;
- stacklen = ovl_split_lowerdirs(lowertmp);
- if (stacklen > OVL_MAX_STACK) {
- pr_err("too many lower directories, limit is %d\n",
- OVL_MAX_STACK);
- goto out_err;
- } else if (!ofs->config.upperdir && stacklen == 1) {
+ if (!ofs->config.upperdir && numlower == 1) {
pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
- goto out_err;
+ return ERR_PTR(-EINVAL);
} else if (!ofs->config.upperdir && ofs->config.nfs_export &&
ofs->config.redirect_follow) {
pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
- err = -ENOMEM;
- stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+ stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL);
if (!stack)
- goto out_err;
+ return ERR_PTR(-ENOMEM);
err = -EINVAL;
- lower = lowertmp;
- for (numlower = 0; numlower < stacklen; numlower++) {
- err = ovl_lower_dir(lower, &stack[numlower], ofs,
- &sb->s_stack_depth);
+ for (i = 0; i < numlower; i++) {
+ err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth);
if (err)
goto out_err;
@@ -1560,7 +1601,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
goto out_err;
}
- err = ovl_get_layers(sb, ofs, stack, numlower);
+ err = ovl_get_layers(sb, ofs, stack, numlower, layers);
if (err)
goto out_err;
@@ -1578,7 +1619,6 @@ out:
for (i = 0; i < numlower; i++)
path_put(&stack[i]);
kfree(stack);
- kfree(lowertmp);
return oe;
@@ -1629,8 +1669,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
{
int i, err;
- if (ofs->upper_mnt) {
- err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root,
+ if (ovl_upper_mnt(ofs)) {
+ err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root,
"upperdir");
if (err)
return err;
@@ -1702,7 +1742,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_fs *ofs;
+ struct ovl_layer *layers;
struct cred *cred;
+ char *splitlower = NULL;
+ unsigned int numlower;
int err;
sb->s_d_op = &ovl_dentry_operations;
@@ -1716,6 +1759,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!cred)
goto out_err;
+ /* Is there a reason anyone would want not to share whiteouts? */
+ ofs->share_whiteout = true;
+
ofs->config.index = ovl_index_def;
ofs->config.nfs_export = ovl_nfs_export_def;
ofs->config.xino = ovl_xino_def();
@@ -1731,6 +1777,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_err;
}
+ err = -ENOMEM;
+ splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
+ if (!splitlower)
+ goto out_err;
+
+ numlower = ovl_split_lowerdirs(splitlower);
+ if (numlower > OVL_MAX_STACK) {
+ pr_err("too many lower directories, limit is %d\n",
+ OVL_MAX_STACK);
+ goto out_err;
+ }
+
+ layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
+ if (!layers)
+ goto out_err;
+
+ ofs->layers = layers;
+ /* Layer 0 is reserved for upper even if there's no upper */
+ ofs->numlayer = 1;
+
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
atomic_long_set(&ofs->last_ino, 1);
@@ -1752,7 +1818,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_err;
}
- err = ovl_get_upper(sb, ofs, &upperpath);
+ err = ovl_get_upper(sb, ofs, &layers[0], &upperpath);
if (err)
goto out_err;
@@ -1763,31 +1829,35 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
- sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth;
- sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran;
+ sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth;
+ sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran;
}
- oe = ovl_get_lowerstack(sb, ofs);
+ oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
err = PTR_ERR(oe);
if (IS_ERR(oe))
goto out_err;
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
- if (!ofs->upper_mnt)
+ if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
+ /* index dir will act also as workdir */
+ dput(ofs->workdir);
+ ofs->workdir = NULL;
+ iput(ofs->workdir_trap);
+ ofs->workdir_trap = NULL;
+
err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
if (err)
goto out_free_oe;
/* Force r/o mount with no index dir */
- if (!ofs->indexdir) {
- dput(ofs->workdir);
- ofs->workdir = NULL;
+ if (ofs->indexdir)
+ ofs->workdir = dget(ofs->indexdir);
+ else
sb->s_flags |= SB_RDONLY;
- }
-
}
err = ovl_check_overlapping_layers(sb, ofs);
@@ -1797,7 +1867,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/* Show index=off in /proc/mounts for forced r/o mount */
if (!ofs->indexdir) {
ofs->config.index = false;
- if (ofs->upper_mnt && ofs->config.nfs_export) {
+ if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
@@ -1818,6 +1888,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ovl_xattr_handlers;
sb->s_fs_info = ofs;
sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_SKIP_SYNC;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
@@ -1825,6 +1896,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_oe;
mntput(upperpath.mnt);
+ kfree(splitlower);
sb->s_root = root_dentry;
@@ -1834,6 +1906,7 @@ out_free_oe:
ovl_entry_stack_free(oe);
kfree(oe);
out_err:
+ kfree(splitlower);
path_put(&upperpath);
ovl_free_fs(ofs);
out:
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 36b60788ee47..56c1f89f20c9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -18,13 +18,13 @@
int ovl_want_write(struct dentry *dentry)
{
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- return mnt_want_write(ofs->upper_mnt);
+ return mnt_want_write(ovl_upper_mnt(ofs));
}
void ovl_drop_write(struct dentry *dentry)
{
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- mnt_drop_write(ofs->upper_mnt);
+ mnt_drop_write(ovl_upper_mnt(ofs));
}
struct dentry *ovl_workdir(struct dentry *dentry)
@@ -150,7 +150,7 @@ void ovl_path_upper(struct dentry *dentry, struct path *path)
{
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- path->mnt = ofs->upper_mnt;
+ path->mnt = ovl_upper_mnt(ofs);
path->dentry = ovl_dentry_upper(dentry);
}
@@ -459,7 +459,32 @@ bool ovl_is_whiteout(struct dentry *dentry)
struct file *ovl_path_open(struct path *path, int flags)
{
- return dentry_open(path, flags | O_NOATIME, current_cred());
+ struct inode *inode = d_inode(path->dentry);
+ int err, acc_mode;
+
+ if (flags & ~(O_ACCMODE | O_LARGEFILE))
+ BUG();
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ acc_mode = MAY_READ;
+ break;
+ case O_WRONLY:
+ acc_mode = MAY_WRITE;
+ break;
+ default:
+ BUG();
+ }
+
+ err = inode_permission(inode, acc_mode | MAY_OPEN);
+ if (err)
+ return ERR_PTR(err);
+
+ /* O_NOATIME is an optimization, don't fail if not permitted */
+ if (inode_owner_or_capable(inode))
+ flags |= O_NOATIME;
+
+ return dentry_open(path, flags, current_cred());
}
/* Caller should hold ovl_inode->lock */
@@ -707,7 +732,8 @@ static void ovl_cleanup_index(struct dentry *dentry)
index = NULL;
} else if (ovl_index_all(dentry->d_sb)) {
/* Whiteout orphan index to block future open by handle */
- err = ovl_cleanup_and_whiteout(indexdir, dir, index);
+ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
+ dir, index);
} else {
/* Cleanup orphan index entries */
err = ovl_cleanup(dir, index);
diff --git a/fs/pipe.c b/fs/pipe.c
index c7c4fb5f345f..60dbee457143 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -24,6 +24,7 @@
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
+#include <linux/watch_queue.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -259,14 +260,44 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->note_loss) {
+ struct watch_notification n;
+
+ if (total_len < 8) {
+ if (ret == 0)
+ ret = -ENOBUFS;
+ break;
+ }
+
+ n.type = WATCH_TYPE_META;
+ n.subtype = WATCH_META_LOSS_NOTIFICATION;
+ n.info = watch_sizeof(n);
+ if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
+ ret += sizeof(n);
+ total_len -= sizeof(n);
+ pipe->note_loss = false;
+ }
+#endif
+
if (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
- if (chars > total_len)
+ if (chars > total_len) {
+ if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
+ if (ret == 0)
+ ret = -ENOBUFS;
+ break;
+ }
chars = total_len;
+ }
error = pipe_buf_confirm(pipe, buf);
if (error) {
@@ -294,6 +325,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -405,6 +440,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ ret = -EXDEV;
+ goto out;
+ }
+#endif
+
/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
@@ -574,22 +616,37 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int count, head, tail, mask;
switch (cmd) {
- case FIONREAD:
- __pipe_lock(pipe);
- count = 0;
- head = pipe->head;
- tail = pipe->tail;
- mask = pipe->ring_size - 1;
+ case FIONREAD:
+ __pipe_lock(pipe);
+ count = 0;
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
- while (tail != head) {
- count += pipe->bufs[tail & mask].len;
- tail++;
- }
- __pipe_unlock(pipe);
+ while (tail != head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
+ }
+ __pipe_unlock(pipe);
+
+ return put_user(count, (int __user *)arg);
- return put_user(count, (int __user *)arg);
- default:
- return -ENOIOCTLCMD;
+#ifdef CONFIG_WATCH_QUEUE
+ case IOC_WATCH_QUEUE_SET_SIZE: {
+ int ret;
+ __pipe_lock(pipe);
+ ret = watch_queue_set_size(pipe, arg);
+ __pipe_unlock(pipe);
+ return ret;
+ }
+
+ case IOC_WATCH_QUEUE_SET_FILTER:
+ return watch_queue_set_filter(
+ pipe, (struct watch_notification_filter __user *)arg);
+#endif
+
+ default:
+ return -ENOIOCTLCMD;
}
}
@@ -700,27 +757,27 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
-static unsigned long account_pipe_buffers(struct user_struct *user,
- unsigned long old, unsigned long new)
+unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new)
{
return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
+bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
return soft_limit && user_bufs > soft_limit;
}
-static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
+bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
return hard_limit && user_bufs > hard_limit;
}
-static bool is_unprivileged_user(void)
+bool pipe_is_unprivileged_user(void)
{
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
@@ -742,12 +799,12 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
- if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
+ if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
- if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
+ if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
@@ -759,6 +816,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
+ pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -776,7 +834,14 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ watch_queue_clear(pipe->watch_queue);
+ put_watch_queue(pipe->watch_queue);
+ }
+#endif
+
+ (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
@@ -852,6 +917,17 @@ int create_pipe_files(struct file **res, int flags)
if (!inode)
return -ENFILE;
+ if (flags & O_NOTIFICATION_PIPE) {
+#ifdef CONFIG_WATCH_QUEUE
+ if (watch_queue_init(inode->i_pipe) < 0) {
+ iput(inode);
+ return -ENOMEM;
+ }
+#else
+ return -ENOPKG;
+#endif
+ }
+
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
@@ -882,7 +958,7 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags)
int error;
int fdw, fdr;
- if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
+ if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
error = create_pipe_files(files, flags);
@@ -1130,42 +1206,12 @@ unsigned int round_pipe_size(unsigned long size)
}
/*
- * Allocate a new array of pipe buffers and copy the info over. Returns the
- * pipe size if successful, or return -ERROR on error.
+ * Resize the pipe ring to a number of slots.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_slots, head, tail, mask, n;
- unsigned long user_bufs;
- long ret = 0;
-
- size = round_pipe_size(arg);
- nr_slots = size >> PAGE_SHIFT;
-
- if (!nr_slots)
- return -EINVAL;
-
- /*
- * If trying to increase the pipe capacity, check that an
- * unprivileged user is not trying to exceed various limits
- * (soft limit check here, hard limit check just below).
- * Decreasing the pipe capacity is always permitted, even
- * if the user is currently over a limit.
- */
- if (nr_slots > pipe->ring_size &&
- size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
-
- if (nr_slots > pipe->ring_size &&
- (too_many_pipe_buffers_hard(user_bufs) ||
- too_many_pipe_buffers_soft(user_bufs)) &&
- is_unprivileged_user()) {
- ret = -EPERM;
- goto out_revert_acct;
- }
+ unsigned int head, tail, mask, n;
/*
* We can shrink the pipe, if arg is greater than the ring occupancy.
@@ -1177,17 +1223,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
head = pipe->head;
tail = pipe->tail;
n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n) {
- ret = -EBUSY;
- goto out_revert_acct;
- }
+ if (nr_slots < n)
+ return -EBUSY;
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs)) {
- ret = -ENOMEM;
- goto out_revert_acct;
- }
+ if (unlikely(!bufs))
+ return -ENOMEM;
/*
* The pipe array wraps around, so just start the new one at zero
@@ -1215,16 +1257,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
- pipe->max_usage = nr_slots;
+ if (pipe->max_usage > nr_slots)
+ pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
+ return 0;
+}
+
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+ unsigned long user_bufs;
+ unsigned int nr_slots, size;
+ long ret = 0;
+
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ return -EBUSY;
+#endif
+
+ size = round_pipe_size(arg);
+ nr_slots = size >> PAGE_SHIFT;
+
+ if (!nr_slots)
+ return -EINVAL;
+
+ /*
+ * If trying to increase the pipe capacity, check that an
+ * unprivileged user is not trying to exceed various limits
+ * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even
+ * if the user is currently over a limit.
+ */
+ if (nr_slots > pipe->max_usage &&
+ size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
+
+ if (nr_slots > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto out_revert_acct;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_slots);
+ if (ret < 0)
+ goto out_revert_acct;
+
+ pipe->max_usage = nr_slots;
+ pipe->nr_accounted = nr_slots;
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
return ret;
}
@@ -1233,9 +1327,17 @@ out_revert_acct:
* location, so checking ->i_pipe is not enough to verify that this is a
* pipe.
*/
-struct pipe_inode_info *get_pipe_info(struct file *file)
+struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
- return file->f_op == &pipefifo_fops ? file->private_data : NULL;
+ struct pipe_inode_info *pipe = file->private_data;
+
+ if (file->f_op != &pipefifo_fops || !pipe)
+ return NULL;
+#ifdef CONFIG_WATCH_QUEUE
+ if (for_splice && pipe->watch_queue)
+ return NULL;
+#endif
+ return pipe;
}
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1243,7 +1345,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
struct pipe_inode_info *pipe;
long ret;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 971a42f6357d..c930001056f9 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -66,7 +66,7 @@ config PROC_SYSCTL
depends on PROC_FS
select SYSCTL
default y
- ---help---
+ help
The sysctl interface provides a means of dynamically changing
certain kernel parameters and variables on the fly without requiring
a recompile of the kernel or reboot of the system. The primary
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 713ffac59bbb..55ecbeb3a721 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -92,7 +92,6 @@
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include "internal.h"
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 066d9c0f4664..d86c0afc8a85 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2112,11 +2112,11 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
- status = down_read_killable(&mm->mmap_sem);
+ status = mmap_read_lock_killable(mm);
if (!status) {
exact_vma_exists = !!find_exact_vma(mm, vm_start,
vm_end);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
}
@@ -2163,7 +2163,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
if (rc)
goto out_mmput;
- rc = down_read_killable(&mm->mmap_sem);
+ rc = mmap_read_lock_killable(mm);
if (rc)
goto out_mmput;
@@ -2174,7 +2174,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
path_get(path);
rc = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_mmput:
mmput(mm);
@@ -2264,7 +2264,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out_put_task;
result = ERR_PTR(-EINTR);
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
goto out_put_mm;
result = ERR_PTR(-ENOENT);
@@ -2277,7 +2277,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
(void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_put_mm:
mmput(mm);
out_put_task:
@@ -2322,7 +2322,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
if (!mm)
goto out_put_task;
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret) {
mmput(mm);
goto out_put_task;
@@ -2333,11 +2333,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
/*
* We need two passes here:
*
- * 1) Collect vmas of mapped files with mmap_sem taken
- * 2) Release mmap_sem and instantiate entries
+ * 1) Collect vmas of mapped files with mmap_lock taken
+ * 2) Release mmap_lock and instantiate entries
*
* otherwise we get lockdep complained, since filldir()
- * routine might require mmap_sem taken in might_fault().
+ * routine might require mmap_lock taken in might_fault().
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
@@ -2349,7 +2349,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
goto out_put_task;
}
@@ -2358,7 +2358,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
p->end = vma->vm_end;
p->mode = vma->vm_file->f_mode;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
for (i = 0; i < nr_files; i++) {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f40c2532c057..28d6105e908e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -617,7 +617,7 @@ const struct inode_operations proc_link_inode_operations = {
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode *inode = new_inode_pseudo(sb);
+ struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = de->low_ino;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ecc63ce01be7..e9a6841fc25b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,7 +17,6 @@
#include <linux/cma.h>
#endif
#include <asm/page.h>
-#include <asm/pgtable.h>
#include "internal.h"
void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 14c2badb8fd9..13452b32e2bd 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -22,7 +22,6 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5b405f32971d..42c5128c7d1c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -565,6 +565,10 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
if (!table->proc_handler)
goto out;
+ /* don't even try if the size is too large */
+ if (count > KMALLOC_MAX_SIZE)
+ return -ENOMEM;
+
if (write) {
kbuf = memdup_user_nul(ubuf, count);
if (IS_ERR(kbuf)) {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ffebed1999e5..5e444d4f9717 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -264,11 +264,13 @@ static void proc_kill_sb(struct super_block *sb)
{
struct proc_fs_info *fs_info = proc_sb_info(sb);
- if (fs_info->proc_self)
- dput(fs_info->proc_self);
+ if (!fs_info) {
+ kill_anon_super(sb);
+ return;
+ }
- if (fs_info->proc_thread_self)
- dput(fs_info->proc_thread_self);
+ dput(fs_info->proc_self);
+ dput(fs_info->proc_thread_self);
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ca5158fa561c..72cd69bcaf4a 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -43,7 +43,7 @@ int proc_setup_self(struct super_block *s)
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6ad407d5efe2..dbda4499a859 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -145,7 +145,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return NULL;
}
- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -188,7 +188,7 @@ static void m_stop(struct seq_file *m, void *v)
return;
release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -593,7 +593,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pmd_trans_unstable(pmd))
goto out;
/*
- * The mmap_sem held all the way back in m_start() is what
+ * The mmap_lock held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
* in here.
*/
@@ -752,7 +752,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
}
}
#endif
- /* mmap_sem is held in m_start */
+ /* mmap_lock is held in m_start */
walk_page_vma(vma, &smaps_walk_ops, mss);
}
@@ -847,7 +847,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
memset(&mss, 0, sizeof(mss));
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
@@ -866,7 +866,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
__show_smap(m, &mss, true);
release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_put_mm:
mmput(mm);
@@ -1140,7 +1140,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
- if (down_write_killable(&mm->mmap_sem)) {
+ if (mmap_write_lock_killable(mm)) {
count = -EINTR;
goto out_mm;
}
@@ -1150,11 +1150,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
* resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
goto out_mm;
}
- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
count = -EINTR;
goto out_mm;
}
@@ -1163,8 +1163,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
- up_read(&mm->mmap_sem);
- if (down_write_killable(&mm->mmap_sem)) {
+ mmap_read_unlock(mm);
+ if (mmap_write_lock_killable(mm)) {
count = -EINTR;
goto out_mm;
}
@@ -1183,14 +1183,14 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
* failed like if
* get_proc_task() fails?
*/
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
}
- downgrade_write(&mm->mmap_sem);
+ mmap_write_downgrade(mm);
break;
}
@@ -1203,7 +1203,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, 0, -1);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_mm:
mmput(mm);
}
@@ -1564,11 +1564,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* overflow ? */
if (end < start_vaddr || end > end_vaddr)
end = end_vaddr;
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret)
goto out_free;
ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
start_vaddr = end;
len = min(count, PM_ENTRY_BYTES * pm.pos);
@@ -1827,7 +1827,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_sem is held by m_start */
+ /* mmap_lock is held by m_start */
walk_page_vma(vma, &show_numa_ops, md);
if (!md->pages)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7907e6419e57..a6d21fc0033c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -25,7 +25,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
@@ -77,7 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"Shared:\t%8lu bytes\n",
bytes, slack, sbytes);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -86,12 +86,12 @@ unsigned long task_vsize(struct mm_struct *mm)
struct rb_node *p;
unsigned long vsize = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
vsize += vma->vm_end - vma->vm_start;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return vsize;
}
@@ -104,7 +104,7 @@ unsigned long task_statm(struct mm_struct *mm,
struct rb_node *p;
unsigned long size = kobjsize(mm);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
size += kobjsize(vma);
@@ -119,7 +119,7 @@ unsigned long task_statm(struct mm_struct *mm,
>> PAGE_SHIFT;
*data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
>> PAGE_SHIFT;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
size >>= PAGE_SHIFT;
size += *text + *data;
*resident = size;
@@ -211,7 +211,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (!mm || !mmget_not_zero(mm))
return NULL;
- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
mmput(mm);
return ERR_PTR(-EINTR);
}
@@ -221,7 +221,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (n-- == 0)
return p;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
return NULL;
}
@@ -231,7 +231,7 @@ static void m_stop(struct seq_file *m, void *_vml)
struct proc_maps_private *priv = m->private;
if (!IS_ERR_OR_NULL(_vml)) {
- up_read(&priv->mm->mmap_sem);
+ mmap_read_unlock(priv->mm);
mmput(priv->mm);
}
if (priv->task) {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index ac284f409568..a553273fbd41 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -43,7 +43,7 @@ int proc_setup_thread_self(struct super_block *s)
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index c663202da8de..c3a345c28a93 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
-#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 9737b8e68878..8eb87008b55a 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -2,7 +2,7 @@
config ROMFS_FS
tristate "ROM file system support"
depends on BLOCK || MTD
- ---help---
+ help
This is a very small read-only file system mainly intended for
initial ram disks of installation disks, but it could be used for
other read-only media as well. Read
diff --git a/fs/select.c b/fs/select.c
index 11d0285d46b7..7aef49552d4c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -766,22 +766,38 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
* which has a pointer to the sigset_t itself followed by a size_t containing
* the sigset size.
*/
+struct sigset_argpack {
+ sigset_t __user *p;
+ size_t size;
+};
+
+static inline int get_sigset_argpack(struct sigset_argpack *to,
+ struct sigset_argpack __user *from)
+{
+ // the path is hot enough for overhead of copy_from_user() to matter
+ if (from) {
+ if (!user_read_access_begin(from, sizeof(*from)))
+ return -EFAULT;
+ unsafe_get_user(to->p, &from->p, Efault);
+ unsafe_get_user(to->size, &from->size, Efault);
+ user_read_access_end();
+ }
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
void __user *, sig)
{
- size_t sigsetsize = 0;
- sigset_t __user *up = NULL;
-
- if (sig) {
- if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
- || __get_user(up, (sigset_t __user * __user *)sig)
- || __get_user(sigsetsize,
- (size_t __user *)(sig+sizeof(void *))))
- return -EFAULT;
- }
+ struct sigset_argpack x = {NULL, 0};
+
+ if (get_sigset_argpack(&x, sig))
+ return -EFAULT;
- return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_TIMESPEC);
+ return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC);
}
#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
@@ -790,18 +806,12 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *,
fd_set __user *, exp, struct old_timespec32 __user *, tsp,
void __user *, sig)
{
- size_t sigsetsize = 0;
- sigset_t __user *up = NULL;
-
- if (sig) {
- if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
- || __get_user(up, (sigset_t __user * __user *)sig)
- || __get_user(sigsetsize,
- (size_t __user *)(sig+sizeof(void *))))
- return -EFAULT;
- }
+ struct sigset_argpack x = {NULL, 0};
+
+ if (get_sigset_argpack(&x, sig))
+ return -EFAULT;
- return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_OLD_TIMESPEC);
+ return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC);
}
#endif
@@ -1325,24 +1335,37 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return poll_select_finish(&end_time, tsp, type, ret);
}
+struct compat_sigset_argpack {
+ compat_uptr_t p;
+ compat_size_t size;
+};
+static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to,
+ struct compat_sigset_argpack __user *from)
+{
+ if (from) {
+ if (!user_read_access_begin(from, sizeof(*from)))
+ return -EFAULT;
+ unsafe_get_user(to->p, &from->p, Efault);
+ unsafe_get_user(to->size, &from->size, Efault);
+ user_read_access_end();
+ }
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct __kernel_timespec __user *, tsp, void __user *, sig)
{
- compat_size_t sigsetsize = 0;
- compat_uptr_t up = 0;
-
- if (sig) {
- if (!access_ok(sig,
- sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
- __get_user(up, (compat_uptr_t __user *)sig) ||
- __get_user(sigsetsize,
- (compat_size_t __user *)(sig+sizeof(up))))
- return -EFAULT;
- }
+ struct compat_sigset_argpack x = {0, 0};
+
+ if (get_compat_sigset_argpack(&x, sig))
+ return -EFAULT;
- return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
- sigsetsize, PT_TIMESPEC);
+ return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p),
+ x.size, PT_TIMESPEC);
}
#if defined(CONFIG_COMPAT_32BIT_TIME)
@@ -1351,20 +1374,13 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct old_timespec32 __user *, tsp, void __user *, sig)
{
- compat_size_t sigsetsize = 0;
- compat_uptr_t up = 0;
-
- if (sig) {
- if (!access_ok(sig,
- sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
- __get_user(up, (compat_uptr_t __user *)sig) ||
- __get_user(sigsetsize,
- (compat_size_t __user *)(sig+sizeof(up))))
- return -EFAULT;
- }
+ struct compat_sigset_argpack x = {0, 0};
+
+ if (get_compat_sigset_argpack(&x, sig))
+ return -EFAULT;
- return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
- sigsetsize, PT_OLD_TIMESPEC);
+ return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p),
+ x.size, PT_OLD_TIMESPEC);
}
#endif
diff --git a/fs/splice.c b/fs/splice.c
index 6b3c9a018a8e..d7c8a7c4db07 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1101,8 +1101,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
!(out->f_mode & FMODE_WRITE)))
return -EBADF;
- ipipe = get_pipe_info(in);
- opipe = get_pipe_info(out);
+ ipipe = get_pipe_info(in, true);
+ opipe = get_pipe_info(out, true);
if (ipipe && opipe) {
if (off_in || off_out)
@@ -1252,7 +1252,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
unsigned int flags)
{
- struct pipe_inode_info *pipe = get_pipe_info(file);
+ struct pipe_inode_info *pipe = get_pipe_info(file, true);
struct splice_desc sd = {
.total_len = iov_iter_count(iter),
.flags = flags,
@@ -1287,7 +1287,7 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
if (flags & SPLICE_F_GIFT)
buf_flag = PIPE_BUF_FLAG_GIFT;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, true);
if (!pipe)
return -EBADF;
@@ -1733,8 +1733,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
{
- struct pipe_inode_info *ipipe = get_pipe_info(in);
- struct pipe_inode_info *opipe = get_pipe_info(out);
+ struct pipe_inode_info *ipipe = get_pipe_info(in, true);
+ struct pipe_inode_info *opipe = get_pipe_info(out, true);
int ret = -EINVAL;
if (unlikely(!(in->f_mode & FMODE_READ) ||
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 7187bd1a30ea..8d64edb80ebf 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -262,7 +262,7 @@ struct squashfs_dir_index {
__le32 index;
__le32 start_block;
__le32 size;
- unsigned char name[0];
+ unsigned char name[];
};
struct squashfs_base_inode {
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
__le32 inode_number;
__le32 nlink;
__le32 symlink_size;
- char symlink[0];
+ char symlink[];
};
struct squashfs_reg_inode {
@@ -341,7 +341,7 @@ struct squashfs_reg_inode {
__le32 fragment;
__le32 offset;
__le32 file_size;
- __le16 block_list[0];
+ __le16 block_list[];
};
struct squashfs_lreg_inode {
@@ -358,7 +358,7 @@ struct squashfs_lreg_inode {
__le32 fragment;
__le32 offset;
__le32 xattr;
- __le16 block_list[0];
+ __le16 block_list[];
};
struct squashfs_dir_inode {
@@ -389,7 +389,7 @@ struct squashfs_ldir_inode {
__le16 i_count;
__le16 offset;
__le32 xattr;
- struct squashfs_dir_index index[0];
+ struct squashfs_dir_index index[];
};
union squashfs_inode {
@@ -410,7 +410,7 @@ struct squashfs_dir_entry {
__le16 inode_number;
__le16 type;
__le16 size;
- char name[0];
+ char name[];
};
struct squashfs_dir_header {
@@ -428,12 +428,12 @@ struct squashfs_fragment_entry {
struct squashfs_xattr_entry {
__le16 type;
__le16 size;
- char data[0];
+ char data[];
};
struct squashfs_xattr_val {
__le32 vsize;
- char value[0];
+ char value[];
};
struct squashfs_xattr_id {
diff --git a/fs/super.c b/fs/super.c
index bf3b7685b52a..904459b35119 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -361,7 +361,7 @@ EXPORT_SYMBOL(deactivate_locked_super);
*/
void deactivate_super(struct super_block *s)
{
- if (!atomic_add_unless(&s->s_active, -1, 1)) {
+ if (!atomic_add_unless(&s->s_active, -1, 1)) {
down_write(&s->s_umount);
deactivate_locked_super(s);
}
diff --git a/fs/sync.c b/fs/sync.c
index c6f6f5be5682..1373a610dc78 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -76,7 +76,8 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg)
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
- if (!sb_rdonly(sb) && sb->s_op->sync_fs)
+ if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
+ sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, *(int *)arg);
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e39fdec8a0b0..52de29000c7e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -234,7 +234,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
pte_t *ptep, pte;
bool ret = true;
- VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+ mmap_assert_locked(mm);
ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
@@ -286,7 +286,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte_t *pte;
bool ret = true;
- VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+ mmap_assert_locked(mm);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -369,13 +369,13 @@ static inline bool userfaultfd_signal_pending(unsigned int flags)
* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
* recommendation in __lock_page_or_retry is not an understatement.
*
- * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
* not set.
*
* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
* set, VM_FAULT_RETRY can still be returned if and only if there are
- * fatal_signal_pending()s, and the mmap_sem must be released before
+ * fatal_signal_pending()s, and the mmap_lock must be released before
* returning it.
*/
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
@@ -396,16 +396,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
- * coredumping without mmap_sem and it ends up here.
+ * coredumping without mmap_lock and it ends up here.
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
/*
- * Coredumping runs without mmap_sem so we can only check that
- * the mmap_sem is held, if PF_DUMPCORE was not set.
+ * Coredumping runs without mmap_lock so we can only check that
+ * the mmap_lock is held, if PF_DUMPCORE was not set.
*/
- WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+ mmap_assert_locked(mm);
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
@@ -422,7 +422,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
/*
* If it's already released don't get it. This avoids to loop
* in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_sem.
+ * caller of handle_userfault to release the mmap_lock.
*/
if (unlikely(READ_ONCE(ctx->released))) {
/*
@@ -481,7 +481,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
- /* take the reference before dropping the mmap_sem */
+ /* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
@@ -514,7 +514,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
vmf->address,
vmf->flags, reason);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (likely(must_wait && !READ_ONCE(ctx->released) &&
!userfaultfd_signal_pending(vmf->flags))) {
@@ -637,7 +637,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
struct mm_struct *mm = release_new_ctx->mm;
/* the various vma->vm_userfaultfd_ctx still points to it */
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
/* no task can run (and in turn coredump) yet */
VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
@@ -645,7 +645,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_ctx_put(release_new_ctx);
}
@@ -799,7 +799,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
userfaultfd_ctx_get(ctx);
WRITE_ONCE(ctx->mmap_changing, true);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -890,11 +890,11 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* Flush page faults out of all CPUs. NOTE: all page faults
* must be retried without returning VM_FAULT_SIGBUS if
* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
- * changes while handle_userfault released the mmap_sem. So
+ * changes while handle_userfault released the mmap_lock. So
* it's critical that released is set to true (above), before
- * taking the mmap_sem for writing.
+ * taking the mmap_lock for writing.
*/
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
still_valid = mmget_still_valid(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -920,7 +920,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mmput(mm);
wakeup:
/*
@@ -1248,7 +1248,7 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
/*
* To be sure waitqueue_active() is not reordered by the CPU
* before the pagetable update, use an explicit SMP memory
- * barrier here. PT lock release or up_read(mmap_sem) still
+ * barrier here. PT lock release or mmap_read_unlock(mm) still
* have release semantics that can allow the
* waitqueue_active() to be reordered before the pte update.
*/
@@ -1345,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (!mmget_not_zero(mm))
goto out;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (!mmget_still_valid(mm))
goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
@@ -1492,7 +1492,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
out_unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mmput(mm);
if (!ret) {
__u64 ioctls_out;
@@ -1547,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (!mmget_not_zero(mm))
goto out;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (!mmget_still_valid(mm))
goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
@@ -1664,7 +1664,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
out_unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mmput(mm);
out:
return ret;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 403c90309a8f..00db81eac80d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1173,7 +1173,7 @@ xfs_file_llseek(
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
*
- * mmap_sem (MM)
+ * mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
* i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 64f5f9a440ae..9aea7d68d8ab 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -145,17 +145,17 @@ xfs_ilock_attr_map_shared(
*
* i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
*
- * mmap_sem locking order:
+ * mmap_lock locking order:
*
- * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> i_mmap_lock -> page_lock
*
- * The difference in mmap_sem locking order mean that we cannot hold the
+ * The difference in mmap_lock locking order mean that we cannot hold the
* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
* in get_user_pages() to map the user pages into the kernel address space for
* direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_sem.
+ * page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
* take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
@@ -1630,7 +1630,7 @@ xfs_release(
return 0;
/*
* If we can't get the iolock just skip truncating the blocks
- * past EOF because we could deadlock with the mmap_sem
+ * past EOF because we could deadlock with the mmap_lock
* otherwise. We'll get another chance to drop them once the
* last reference to the inode is dropped, so we'll never leak
* blocks permanently.
@@ -2634,8 +2634,10 @@ xfs_ifree_cluster(
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED, &bp);
- if (error)
+ if (error) {
+ xfs_perag_put(pag);
return error;
+ }
/*
* This buffer may not have been correctly initialised as we
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a40f88cf3ab7..a190212ca85d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1236,64 +1236,26 @@ xfs_ioctl_setattr_xflags(
return 0;
}
-/*
- * If we are changing DAX flags, we have to ensure the file is clean and any
- * cached objects in the address space are invalidated and removed. This
- * requires us to lock out other IO and page faults similar to a truncate
- * operation. The locks need to be held until the transaction has been committed
- * so that the cache invalidation is atomic with respect to the DAX flag
- * manipulation.
- */
-static int
-xfs_ioctl_setattr_dax_invalidate(
+static void
+xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fsxattr *fa,
- int *join_flags)
+ struct fsxattr *fa)
{
- struct inode *inode = VFS_I(ip);
- struct super_block *sb = inode->i_sb;
- int error;
-
- *join_flags = 0;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems where the block size is equal to the page
- * size. On directories it serves as an inherited hint so we don't
- * have to check the device for dax support or flush pagecache.
- */
- if (fa->fsx_xflags & FS_XFLAG_DAX) {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-
- if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize))
- return -EINVAL;
- }
-
- /* If the DAX state is not changing, we have nothing to do here. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
- return 0;
- if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
- return 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
if (S_ISDIR(inode->i_mode))
- return 0;
-
- /* lock, flush and invalidate mapping in preparation for flag change */
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- error = filemap_write_and_wait(inode->i_mapping);
- if (error)
- goto out_unlock;
- error = invalidate_inode_pages2(inode->i_mapping);
- if (error)
- goto out_unlock;
-
- *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
- return 0;
+ return;
-out_unlock:
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- return error;
+ if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) ||
+ (mp->m_flags & XFS_MOUNT_DAX_NEVER))
+ return;
+ if (((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) ||
+ (!(fa->fsx_xflags & FS_XFLAG_DAX) &&
+ (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)))
+ d_mark_dontcache(inode);
}
/*
@@ -1301,17 +1263,10 @@ out_unlock:
* have permission to do so. On success, return a clean transaction and the
* inode locked exclusively ready for further operation specific checks. On
* failure, return an error without modifying or locking the inode.
- *
- * The inode might already be IO locked on call. If this is the case, it is
- * indicated in @join_flags and we take full responsibility for ensuring they
- * are unlocked from now on. Hence if we have an error here, we still have to
- * unlock them. Otherwise, once they are joined to the transaction, they will
- * be unlocked on commit/cancel.
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip,
- int join_flags)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -1328,8 +1283,7 @@ xfs_ioctl_setattr_get_trans(
goto out_unlock;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
- join_flags = 0;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1350,8 +1304,6 @@ xfs_ioctl_setattr_get_trans(
out_cancel:
xfs_trans_cancel(tp);
out_unlock:
- if (join_flags)
- xfs_iunlock(ip, join_flags);
return ERR_PTR(error);
}
@@ -1476,7 +1428,6 @@ xfs_ioctl_setattr(
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
- int join_flags = 0;
trace_xfs_ioctl_setattr(ip);
@@ -1500,18 +1451,9 @@ xfs_ioctl_setattr(
return code;
}
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
- if (code)
- goto error_free_dquots;
+ xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
+ tp = xfs_ioctl_setattr_get_trans(ip);
if (IS_ERR(tp)) {
code = PTR_ERR(tp);
goto error_free_dquots;
@@ -1639,7 +1581,6 @@ xfs_ioc_setxflags(
struct fsxattr fa;
struct fsxattr old_fa;
unsigned int flags;
- int join_flags = 0;
int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1656,18 +1597,9 @@ xfs_ioc_setxflags(
if (error)
return error;
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
- if (error)
- goto out_drop_write;
+ xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
+ tp = xfs_ioctl_setattr_get_trans(ip);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 202b2c0a9e9d..80a13c8561d8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -28,11 +28,11 @@
#include <linux/fiemap.h>
/*
- * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * Directories have different lock order w.r.t. mmap_lock compared to regular
* files. This is due to readdir potentially triggering page faults on a user
* buffer inside filldir(), and this happens with the ilock on the directory
* held. For regular files, the lock order is the other way around - the
- * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * mmap_lock is taken during the page fault, and then we lock the ilock to do
* block mapping. Hence we need a different class for the directory ilock so
* that lockdep can tell them apart.
*/