diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-30 10:04:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-08-30 10:04:31 -0700 |
commit | 3513431926f9bfe3f4fcb06a39d9ec59b0470311 (patch) | |
tree | 57ff608d89c031a9f0c84a6f4fe3ae98677467fb | |
parent | 2287a51ba822384834dafc1c798453375d1107c7 (diff) | |
parent | e43de7f0862b8598cd1ef440e3b4701cd107ea40 (diff) |
Merge tag 'fsnotify_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
Pull fsnotify updates from Jan Kara:
"fsnotify speedups when notification actually isn't used and support
for identifying processes which caused fanotify events through pidfd
instead of normal pid"
* tag 'fsnotify_for_v5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
fsnotify: optimize the case of no marks of any type
fsnotify: count all objects with attached connectors
fsnotify: count s_fsnotify_inode_refs for attached connectors
fsnotify: replace igrab() with ihold() on attach connector
fanotify: add pidfd support to the fanotify API
fanotify: introduce a generic info record copying helper
fanotify: minor cosmetic adjustments to fid labels
kernel/pid.c: implement additional checks upon pidfd_create() parameters
kernel/pid.c: remove static qualifier from pidfd_create()
-rw-r--r-- | fs/notify/fanotify/fanotify_user.c | 251 | ||||
-rw-r--r-- | fs/notify/fsnotify.c | 6 | ||||
-rw-r--r-- | fs/notify/fsnotify.h | 15 | ||||
-rw-r--r-- | fs/notify/mark.c | 52 | ||||
-rw-r--r-- | include/linux/fanotify.h | 3 | ||||
-rw-r--r-- | include/linux/fs.h | 7 | ||||
-rw-r--r-- | include/linux/fsnotify.h | 9 | ||||
-rw-r--r-- | include/linux/pid.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/fanotify.h | 13 | ||||
-rw-r--r-- | kernel/pid.c | 15 |
10 files changed, 276 insertions, 96 deletions
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 28b67cb9458d..6facdf476255 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> +#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 -#define FANOTIFY_INFO_HDR_LEN \ +#define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len) if (name_len) info_len += name_len + 1; - return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); + return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, + FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int fid_mode, +static int fanotify_event_info_len(unsigned int info_mode, struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); @@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode, if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); - } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + } else if ((info_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".". @@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode, dot_len = 1; } + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); @@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (fid_mode) - event_size += fanotify_event_info_len(fid_mode, event); + if (info_mode) + event_size += fanotify_event_info_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - int info_type, const char *name, size_t name_len, - char __user *buf, size_t count) +static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + int info_type, const char *name, + size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; @@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static int copy_info_records_to_user(struct fanotify_event *event, + struct fanotify_info *info, + unsigned int info_mode, int pidfd, + char __user *buf, size_t count) +{ + int ret, total_bytes = 0, info_type = 0; + unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + + /* + * Event info records order is as follows: dir fid + name, child fid. + */ + if (fanotify_event_dir_fh_len(event)) { + info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_DFID; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir_fh(info), + info_type, + fanotify_info_name(info), + info->name_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the name + * "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + return total_bytes; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) @@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL; - int ret, fd = FAN_NOFD; - int info_type = 0; + int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(fid_mode, event); + fanotify_event_info_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } metadata.fd = fd; + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incoporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. + */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Report FAN_NOPIDFD to the event listener + * in those cases, with all other pidfd creation errors being + * reported as FAN_EPIDFD. + */ + if (metadata.pid == 0 || + !pid_has_task(event->pid, PIDTYPE_TGID)) { + pidfd = FAN_NOPIDFD; + } else { + pidfd = pidfd_create(event->pid, 0); + if (pidfd < 0) + pidfd = FAN_EPIDFD; + } + } + ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and @@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); - /* Event info records order is: dir fid + name, child fid */ - if (fanotify_event_dir_fh_len(event)) { - info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_DFID; - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_info_dir_fh(info), - info_type, fanotify_info_name(info), - info->name_len, buf, count); + if (info_mode) { + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); if (ret < 0) goto out_close_fd; - - buf += ret; - count -= ret; - } - - if (fanotify_event_object_fh_len(event)) { - const char *dot = NULL; - int dot_len = 0; - - if (fid_mode == FAN_REPORT_FID || info_type) { - /* - * With only group flag FAN_REPORT_FID only type FID is - * reported. Second info record type is always FID. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } else if ((fid_mode & FAN_REPORT_NAME) && - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_NAME, if name was not - * recorded in an event on a directory, report the - * name "." with info type DFID_NAME. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; - dot = "."; - dot_len = 1; - } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_DIR_FID, a single info - * record has type DFID for directory entry modification - * event and for event on a directory. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID; - } else { - /* - * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, - * a single info record has type FID for event on a - * non-directory, when there is no directory to report. - * For example, on FAN_DELETE_SELF event. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } - - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - info_type, dot, dot_len, buf, count); - if (ret < 0) - goto out_close_fd; - - buf += ret; - count -= ret; } return metadata.event_len; @@ -537,6 +628,10 @@ out_close_fd: put_unused_fd(fd); fput(f); } + + if (pidfd >= 0) + close_fd(pidfd); + return ret; } @@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30d422b8c0fc..963e6ce75b96 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); - /* Wait for outstanding inode references from connectors */ - wait_var_event(&sb->s_fsnotify_inode_refs, - !atomic_long_read(&sb->s_fsnotify_inode_refs)); } void fsnotify_sb_delete(struct super_block *sb) { fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); + /* Wait for outstanding object references from connectors */ + wait_var_event(&sb->s_fsnotify_connectors, + !atomic_long_read(&sb->s_fsnotify_connectors)); } /* diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index ff2063ec6b0f..87d8a50ee803 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb( return container_of(conn->obj, struct super_block, s_fsnotify_marks); } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + switch (conn->type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return fsnotify_conn_inode(conn)->i_sb; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return fsnotify_conn_mount(conn)->mnt.mnt_sb; + case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_conn_sb(conn); + default: + return NULL; + } +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d32ab349db74..95006d1d29ab 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } +static void fsnotify_get_inode_ref(struct inode *inode) +{ + ihold(inode); + atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + +static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + atomic_long_inc(&sb->s_fsnotify_connectors); +} + +static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } + fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { - struct inode *inode; - struct super_block *sb; - if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; - inode = objp; - sb = inode->i_sb; - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) - wake_up_var(&sb->s_fsnotify_inode_refs); + fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - inode = igrab(fsnotify_conn_inode(conn)); + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { + inode = fsnotify_conn_inode(conn); + fsnotify_get_inode_ref(inode); + } + fsnotify_get_sb_connectors(conn); + /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure @@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ if (inode) - iput(inode); + fsnotify_put_inode_ref(inode); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index a16dbeced152..eec3b7c40811 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -27,6 +27,8 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) + /* * fanotify_init() flags that require CAP_SYS_ADMIN. * We do not allow unprivileged groups to request permission events. @@ -35,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ FAN_REPORT_TID | \ + FAN_REPORT_PIDFD | \ FAN_UNLIMITED_QUEUE | \ FAN_UNLIMITED_MARKS) diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..bea8ec5c726c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1507,8 +1507,11 @@ struct super_block { /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; - /* Pending fsnotify inode refs */ - atomic_long_t s_fsnotify_inode_refs; + /* + * Number of inode/mount/sb objects that are being watched, note that + * inodes objects are currently double-accounted. + */ + atomic_long_t s_fsnotify_connectors; /* Being remounted read-only */ int s_readonly_remount; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index f8acddcf54fb..12d3a7d308ab 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -30,6 +30,9 @@ static inline void fsnotify_name(struct inode *dir, __u32 mask, struct inode *child, const struct qstr *name, u32 cookie) { + if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) + return; + fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie); } @@ -41,6 +44,9 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, static inline void fsnotify_inode(struct inode *inode, __u32 mask) { + if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + return; + if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; @@ -53,6 +59,9 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, { struct inode *inode = d_inode(dentry); + if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + return 0; + if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; diff --git a/include/linux/pid.h b/include/linux/pid.h index fa10acb8d6a4..af308e15f174 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -78,6 +78,7 @@ struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); +int pidfd_create(struct pid *pid, unsigned int flags); static inline struct pid *get_pid(struct pid *pid) { diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index fbf9c5c7dd59..64553df9d735 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -51,6 +51,7 @@ #define FAN_ENABLE_AUDIT 0x00000040 /* Flags to determine fanotify event format */ +#define FAN_REPORT_PIDFD 0x00000080 /* Report pidfd for event->pid */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ @@ -123,6 +124,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_FID 1 #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 #define FAN_EVENT_INFO_TYPE_DFID 3 +#define FAN_EVENT_INFO_TYPE_PIDFD 4 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -148,6 +150,15 @@ struct fanotify_event_info_fid { unsigned char handle[0]; }; +/* + * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD. + * It holds a pidfd for the pid that was responsible for generating an event. + */ +struct fanotify_event_info_pidfd { + struct fanotify_event_info_header hdr; + __s32 pidfd; +}; + struct fanotify_response { __s32 fd; __u32 response; @@ -160,6 +171,8 @@ struct fanotify_response { /* No fd set in event */ #define FAN_NOFD -1 +#define FAN_NOPIDFD FAN_NOFD +#define FAN_EPIDFD -2 /* Helper functions to deal with fanotify_event_metadata buffers */ #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata)) diff --git a/kernel/pid.c b/kernel/pid.c index ebdf9c60cd0b..efe87db44683 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * + * This symbol should not be explicitly exported to loadable modules. + * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid, unsigned int flags) +int pidfd_create(struct pid *pid, unsigned int flags) { int fd; + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), flags | O_RDWR | O_CLOEXEC); if (fd < 0) @@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p, flags); - else - fd = -EINVAL; + fd = pidfd_create(p, flags); put_pid(p); return fd; |