From e1603b6effe177210701d3d7132d1b68e7bd2c93 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 9 Feb 2018 18:04:54 +0300
Subject: inotify: Extend ioctl to allow to request id of new watch descriptor

A watch descriptor is the id of a watch created by inotify_add_watch().
It is allocated in inotify_add_to_idr() and takes values starting from 1.
Every new inotify watch obtains the next available number (usually,
old + 1), as served by idr_alloc_cyclic().

The CRIU (Checkpoint/Restore In Userspace) project supports inotify
files and restores watch descriptors with the same numbers they had
before the dump. Since there was no kernel support, we had to use a
loop to add a watch with a specific descriptor id:

	while (1) {
		int wd;

		wd = inotify_add_watch(inotify_fd, path, mask);
		if (wd < 0) {
			break;
		} else if (wd == desired_wd_id) {
			ret = 0;
			break;
		}

		inotify_rm_watch(inotify_fd, wd);
	}

(You may find the actual code at the link below:
https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)

The loop is suboptimal and very expensive, but with no better kernel
support it was the only way to restore a watch descriptor. Happily, the
descriptors we met mostly had small ids, and this approach worked
somehow. But recently containers with big inotify watch descriptors
started to appear, and this approach stopped working at all. When a
descriptor id is something like 0x34d71d6, the restoring process spins
in a busy loop for a long time, the restore hangs, and the resulting
delay in migration from node to node is easy to observe.

This patch aims to solve this problem. It introduces a new ioctl,
INOTIFY_IOC_SETNEXTWD, which allows userspace to request the id of the
next created watch descriptor. It simply calls the idr_set_cursor()
primitive to populate idr::idr_next, so that the next idr_alloc_cyclic()
allocation returns this id, if it is not occupied. This is the way
already used to restore some other resources from userspace. For
example, /proc/sys/kernel/ns_last_pid works the same way for task pids.

The new code is under the CONFIG_CHECKPOINT_RESTORE #ifdef, so small
systems may exclude it.

v2: Use INT_MAX instead of a custom definition of the max id, as the
IDR subsystem guarantees an id is between 0 and INT_MAX.
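For illustration, here is a minimal userspace sketch of how a restorer
could use the new ioctl instead of the loop above. It assumes the
INOTIFY_IOC_SETNEXTWD definition from the corresponding uapi header
change (not shown in this fs/notify-limited diff), and restore_watch()
is a hypothetical helper:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/inotify.h>
	#include <linux/ioctl.h>
	#include <linux/types.h>

	#ifndef INOTIFY_IOC_SETNEXTWD
	#define INOTIFY_IOC_SETNEXTWD	_IOW('I', 0, __s32)
	#endif

	/* Hypothetical helper: recreate a watch with a known descriptor id. */
	static int restore_watch(int inotify_fd, const char *path,
				 uint32_t mask, int desired_wd)
	{
		int wd;

		/* Ask the kernel to hand out desired_wd on the next allocation. */
		if (ioctl(inotify_fd, INOTIFY_IOC_SETNEXTWD, desired_wd) < 0)
			return -1;

		wd = inotify_add_watch(inotify_fd, path, mask);
		if (wd >= 0 && wd != desired_wd)
			return -1;	/* desired id was already occupied */
		return wd;
	}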
CC: Jan Kara
CC: Matthew Wilcox
CC: Andrew Morton
CC: Amir Goldstein
Signed-off-by: Kirill Tkhai
Reviewed-by: Cyrill Gorcunov
Reviewed-by: Matthew Wilcox
Reviewed-by: Andrew Morton
Signed-off-by: Jan Kara
---
 fs/notify/inotify/inotify_user.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 2c908b31d6c9..8f17719842ec 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -307,6 +307,20 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	case INOTIFY_IOC_SETNEXTWD:
+		ret = -EINVAL;
+		if (arg >= 1 && arg <= INT_MAX) {
+			struct inotify_group_private_data *data;
+
+			data = &group->inotify_data;
+			spin_lock(&data->idr_lock);
+			idr_set_cursor(&data->idr, (unsigned int)arg);
+			spin_unlock(&data->idr_lock);
+			ret = 0;
+		}
+		break;
+#endif /* CONFIG_CHECKPOINT_RESTORE */
 	}
 
 	return ret;
--
cgit v1.2.3


From 1f5eaa90010ed7cf0ae90a526c48657d02c6086f Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 21 Feb 2018 14:10:59 +0100
Subject: fanotify: Avoid lost events due to ENOMEM for unlimited queues

Fanotify queues of unlimited length do not expect that events can be
lost. Since these queues are used for system auditing and other
security-related tasks, losing events can even have security
implications. Currently, since the allocation is small (32 bytes), it
cannot fail; however, once we start accounting events in memcgs, the
allocation can start failing. So avoid losing events due to a failure
to allocate memory by making event allocation use __GFP_NOFAIL.
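As background (an editorial sketch, not part of the patch): an
unlimited queue, i.e. a group whose max_events is UINT_MAX, is what a
privileged listener obtains by passing FAN_UNLIMITED_QUEUE to
fanotify_init(), and it is exactly these groups that must not drop
events:

	#include <fcntl.h>
	#include <sys/fanotify.h>

	/* Requires CAP_SYS_ADMIN; the kernel then never caps the queue
	 * length, so a silently lost event would go unnoticed. */
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_UNLIMITED_QUEUE,
			       O_RDONLY | O_LARGEFILE);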
Reviewed-by: Amir Goldstein
Signed-off-by: Jan Kara
---
 fs/notify/fanotify/fanotify.c      | 19 ++++++++++++++-----
 fs/notify/fanotify/fanotify.h      |  3 ++-
 fs/notify/fanotify/fanotify_user.c |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6702a6a0bbb5..928f2a5eedb7 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -139,23 +139,32 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 	return false;
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
+						 struct inode *inode, u32 mask,
 						 const struct path *path)
 {
 	struct fanotify_event_info *event;
+	gfp_t gfp = GFP_KERNEL;
+
+	/*
+	 * For queues with unlimited length lost events are not expected and
+	 * can possibly have security implications. Avoid losing events when
+	 * memory is short.
+	 */
+	if (group->max_events == UINT_MAX)
+		gfp |= __GFP_NOFAIL;
 
 	if (fanotify_is_perm_event(mask)) {
 		struct fanotify_perm_event_info *pevent;
 
-		pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
-					  GFP_KERNEL);
+		pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
 		if (!pevent)
 			return NULL;
 		event = &pevent->fae;
 		pevent->response = 0;
 		goto init;
 	}
-	event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+	event = kmem_cache_alloc(fanotify_event_cachep, gfp);
 	if (!event)
 		return NULL;
 init: __maybe_unused
@@ -210,7 +219,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 		return 0;
 	}
 
-	event = fanotify_alloc_event(inode, mask, data);
+	event = fanotify_alloc_event(group, inode, mask, data);
 	ret = -ENOMEM;
 	if (unlikely(!event))
 		goto finish;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 256d9d1ddea9..8609ba06f474 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -52,5 +52,6 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
 	return container_of(fse, struct fanotify_event_info, fse);
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
+						 struct inode *inode, u32 mask,
 						 const struct path *path);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c07eb3d655ea..72e367822efb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -757,7 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	group->fanotify_data.user = user;
 	atomic_inc(&user->fanotify_listeners);
 
-	oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
+	oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
 	if (unlikely(!oevent)) {
 		fd = -ENOMEM;
 		goto out_destroy_group;
--
cgit v1.2.3


From 7b1f641776e0c8b824fb10135168e4b683a9e2ba Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 21 Feb 2018 15:07:52 +0100
Subject: fsnotify: Let userspace know about lost events due to ENOMEM

Currently, if a notification event is lost because event allocation
failed with ENOMEM, we just silently continue (except for fanotify
permission events, where we deny the access). This is undesirable, as
userspace has no way of knowing whether the notifications it got are
complete or not. Treat events lost due to ENOMEM the same way as events
lost due to queue overflow, so that userspace knows something bad
happened and it likely needs to rescan the filesystem.
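As context for the userspace-visible effect (an editorial sketch;
inotify_fd and rescan_all() are hypothetical): after this change an
inotify reader sees an ENOMEM-induced loss as an ordinary queue
overflow event, which a robust consumer should already handle:

	#include <sys/inotify.h>
	#include <unistd.h>

	char buf[4096];
	ssize_t len = read(inotify_fd, buf, sizeof(buf));

	for (char *p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		if (ev->mask & IN_Q_OVERFLOW)
			rescan_all();	/* events were lost: resync state */
		/* else: handle ev normally */
		p += sizeof(*ev) + ev->len;
	}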
Reviewed-by: Amir Goldstein
Signed-off-by: Jan Kara
---
 fs/notify/fanotify/fanotify.c        | 9 ++++++++-
 fs/notify/inotify/inotify_fsnotify.c | 8 +++++++-
 fs/notify/notification.c             | 3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 928f2a5eedb7..d51e1bb781cf 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -221,8 +221,15 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 
 	event = fanotify_alloc_event(group, inode, mask, data);
 	ret = -ENOMEM;
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * We don't queue overflow events for permission events as
+		 * there the access is denied and so no event is in fact lost.
+		 */
+		if (!fanotify_is_perm_event(mask))
+			fsnotify_queue_overflow(group);
 		goto finish;
+	}
 
 	fsn_event = &event->fse;
 	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 8b73332735ba..40dedb37a1f3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -99,8 +99,14 @@ int inotify_handle_event(struct fsnotify_group *group,
 			      fsn_mark);
 
 	event = kmalloc(alloc_len, GFP_KERNEL);
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * Treat lost event due to ENOMEM the same way as queue
+		 * overflow to let userspace know event was lost.
+		 */
+		fsnotify_queue_overflow(group);
 		return -ENOMEM;
+	}
 
 	fsn_event = &event->fse;
 	fsnotify_init_event(fsn_event, inode, mask);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 66f85c651c52..3c3e36745f59 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -111,7 +111,8 @@ int fsnotify_add_event(struct fsnotify_group *group,
 		return 2;
 	}
 
-	if (group->q_len >= group->max_events) {
+	if (event == group->overflow_event ||
+	    group->q_len >= group->max_events) {
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
 		if (!list_empty(&group->overflow_event->list)) {
--
cgit v1.2.3