From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 10 Aug 2017 15:23:31 -0700
Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads

As Tetsuo points out:
 "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to
  node counters") broke "Slab:" field of /proc/meminfo . It shows nearly
  0kB"

In addition to /proc/meminfo, this problem also affects the slab
counters OOM/allocation failure info dumps, can cause early -ENOMEM from
overcommit protection, and miscalculate image size requirements during
suspend-to-disk.

This is because the patch in question switched the slab counters from
the zone level to the node level, but forgot to update the global
accessor functions to read the aggregate node data instead of the
aggregate zone data.

Use global_node_page_state() to access the global slab counters.

Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters")
Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Stefan Agner <stefan@agner.ch>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c       | 8 ++++----
 kernel/power/snapshot.c | 2 +-
 mm/page_alloc.c         | 9 +++++----
 mm/util.c               | 2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8a428498d6b2..509a61668d90 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -106,13 +106,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		    global_node_page_state(NR_FILE_MAPPED));
 	show_val_kb(m, "Shmem:          ", i.sharedram);
 	show_val_kb(m, "Slab:           ",
-		    global_page_state(NR_SLAB_RECLAIMABLE) +
-		    global_page_state(NR_SLAB_UNRECLAIMABLE));
+		    global_node_page_state(NR_SLAB_RECLAIMABLE) +
+		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
 
 	show_val_kb(m, "SReclaimable:   ",
-		    global_page_state(NR_SLAB_RECLAIMABLE));
+		    global_node_page_state(NR_SLAB_RECLAIMABLE));
 	show_val_kb(m, "SUnreclaim:     ",
-		    global_page_state(NR_SLAB_UNRECLAIMABLE));
+		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
 	seq_printf(m, "KernelStack:    %8lu kB\n",
 		   global_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables:     ",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 222317721c5a..0972a8e09d08 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 {
 	unsigned long size;
 
-	size = global_page_state(NR_SLAB_RECLAIMABLE)
+	size = global_node_page_state(NR_SLAB_RECLAIMABLE)
 		+ global_node_page_state(NR_ACTIVE_ANON)
 		+ global_node_page_state(NR_INACTIVE_ANON)
 		+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc32aa81f359..626a430e32d1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4458,8 +4458,9 @@ long si_mem_available(void)
 	 * Part of the reclaimable slab consists of items that are in use,
 	 * and cannot be freed. Cap this estimate at the low watermark.
 	 */
-	available += global_page_state(NR_SLAB_RECLAIMABLE) -
-		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+	available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+			 wmark_low);
 
 	if (available < 0)
 		available = 0;
@@ -4602,8 +4603,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_node_page_state(NR_FILE_DIRTY),
 		global_node_page_state(NR_WRITEBACK),
 		global_node_page_state(NR_UNSTABLE_NFS),
-		global_page_state(NR_SLAB_RECLAIMABLE),
-		global_page_state(NR_SLAB_UNRECLAIMABLE),
+		global_node_page_state(NR_SLAB_RECLAIMABLE),
+		global_node_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_node_page_state(NR_FILE_MAPPED),
 		global_node_page_state(NR_SHMEM),
 		global_page_state(NR_PAGETABLE),
diff --git a/mm/util.c b/mm/util.c
index 7b07ec852e01..9ecddf568fe3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 * which are reclaimable, under pressure.  The dentry
 		 * cache and most inode caches should fall into this
 		 */
-		free += global_page_state(NR_SLAB_RECLAIMABLE);
+		free += global_node_page_state(NR_SLAB_RECLAIMABLE);
 
 		/*
 		 * Leave reserved pages. The pages are not for anonymous pages.
-- 
cgit v1.2.3


From 75dddef32514f7aa58930bde6a1263253bc3d4ba Mon Sep 17 00:00:00 2001
From: Jonathan Toppins <jtoppins@redhat.com>
Date: Thu, 10 Aug 2017 15:23:35 -0700
Subject: mm: ratelimit PFNs busy info message

The RDMA subsystem can generate several thousand of these messages per
second eventually leading to a kernel crash.  Ratelimit these messages
to prevent this crash.

Doug said:
 "I've been carrying a version of this for several kernel versions. I
  don't remember when they started, but we have one (and only one) class
  of machines: Dell PE R730xd, that generate these errors. When it
  happens, without a rate limit, we get rcu timeouts and kernel oopses.
  With the rate limit, we just get a lot of annoying kernel messages but
  the machine continues on, recovers, and eventually the memory
  operations all succeed"

And:
 "> Well... why are all these EBUSY's occurring? It sounds inefficient
  > (at least) but if it is expected, normal and unavoidable then
  > perhaps we should just remove that message altogether?

  I don't have an answer to that question. To be honest, I haven't
  looked real hard. We never had this at all, then it started out of the
  blue, but only on our Dell 730xd machines (and it hits all of them),
  but no other classes or brands of machines. And we have our 730xd
  machines loaded up with different brands and models of cards (for
  instance one dedicated to mlx4 hardware, one for qib, one for mlx5, an
  ocrdma/cxgb4 combo, etc), so the fact that it hit all of the machines
  meant it wasn't tied to any particular brand/model of RDMA hardware.
  To me, it always smelled of a hardware oddity specific to maybe the
  CPUs or mainboard chipsets in these machines, so given that I'm not an
  mm expert anyway, I never chased it down.

  A few other relevant details: it showed up somewhere around 4.8/4.9 or
  thereabouts. It never happened before, but the prinkt has been there
  since the 3.18 days, so possibly the test to trigger this message was
  changed, or something else in the allocator changed such that the
  situation started happening on these machines?

  And, like I said, it is specific to our 730xd machines (but they are
  all identical, so that could mean it's something like their specific
  ram configuration is causing the allocator to hit this on these
  machine but not on other machines in the cluster, I don't want to say
  it's necessarily the model of chipset or CPU, there are other bits of
  identicalness between these machines)"

Link: http://lkml.kernel.org/r/499c0f6cc10d6eb829a67f2a4d75b4228a9b356e.1501695897.git.jtoppins@redhat.com
Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
Reviewed-by: Doug Ledford <dledford@redhat.com>
Tested-by: Doug Ledford <dledford@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 626a430e32d1..6d00f746c2fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7669,7 +7669,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, false)) {
-		pr_info("%s: [%lx, %lx) PFNs busy\n",
+		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
 			__func__, outer_start, end);
 		ret = -EBUSY;
 		goto done;
-- 
cgit v1.2.3


From 5af10dfd0afc559bb4b0f7e3e8227a1578333995 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 10 Aug 2017 15:23:38 -0700
Subject: userfaultfd: hugetlbfs: remove superfluous page unlock in VM_SHARED
 case

huge_add_to_page_cache->add_to_page_cache implicitly unlocks the page
before returning in case of errors.

The error returned was -EEXIST by running UFFDIO_COPY on a non-hole
offset of a VM_SHARED hugetlbfs mapping.  It was an userland bug that
triggered it and the kernel must cope with it returning -EEXIST from
ioctl(UFFDIO_COPY) as expected.

  page dumped because: VM_BUG_ON_PAGE(!PageLocked(page))
  kernel BUG at mm/filemap.c:964!
  invalid opcode: 0000 [#1] SMP
  CPU: 1 PID: 22582 Comm: qemu-system-x86 Not tainted 4.11.11-300.fc26.x86_64 #1
  RIP: unlock_page+0x4a/0x50
  Call Trace:
    hugetlb_mcopy_atomic_pte+0xc0/0x320
    mcopy_atomic+0x96f/0xbe0
    userfaultfd_ioctl+0x218/0xe90
    do_vfs_ioctl+0xa5/0x600
    SyS_ioctl+0x79/0x90
    entry_SYSCALL_64_fastpath+0x1a/0xa9

Link: http://lkml.kernel.org/r/20170802165145.22628-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1a0ac0ad6f6..31e207cb399b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4062,9 +4062,9 @@ out:
 	return ret;
 out_release_unlock:
 	spin_unlock(ptl);
-out_release_nounlock:
 	if (vm_shared)
 		unlock_page(page);
+out_release_nounlock:
 	put_page(page);
 	goto out;
 }
-- 
cgit v1.2.3


From a4afe8cdec1646c3d258b02d1cfdfb1313b76eb1 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 10 Aug 2017 15:23:40 -0700
Subject: test_kmod: fix spelling mistake: "EMTPY" -> "EMPTY"

Trivial fix to spelling mistake in snprintf text

[mcgrof@kernel.org: massaged commit message]
Link: http://lkml.kernel.org/r/20170802211450.27928-3-mcgrof@kernel.org
Fixes: 39258f448d71 ("kmod: add test driver to stress test the module loader")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Michal Marek <mmarek@suse.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Binderman <dcb314@hotmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_kmod.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index 6c1d678bcf8b..90c91541fc16 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -485,7 +485,7 @@ static ssize_t config_show(struct device *dev,
 				config->test_driver);
 	else
 		len += snprintf(buf+len, PAGE_SIZE - len,
-				"driver:\tEMTPY\n");
+				"driver:\tEMPTY\n");
 
 	if (config->test_fs)
 		len += snprintf(buf+len, PAGE_SIZE - len,
@@ -493,7 +493,7 @@ static ssize_t config_show(struct device *dev,
 				config->test_fs);
 	else
 		len += snprintf(buf+len, PAGE_SIZE - len,
-				"fs:\tEMTPY\n");
+				"fs:\tEMPTY\n");
 
 	mutex_unlock(&test_dev->config_mutex);
 
-- 
cgit v1.2.3


From 434b06ae23bab4f4111a674f9b64409c87fa8df9 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Thu, 10 Aug 2017 15:23:44 -0700
Subject: test_kmod: fix bug which allows negative values on two config options

Parsing with kstrtol() enables values to be negative, and we failed to
check for negative values when parsing with test_dev_config_update_uint_sync()
or test_dev_config_update_uint_range().

test_dev_config_update_uint_range() has a minimum check though so an
issue is not present there.  test_dev_config_update_uint_sync() is only
used for the number of threads to use (config_num_threads_store()), and
indeed this would fail with an attempt for a large allocation.

Although the issue is only present in practice with the first fix both
by using kstrtoul() instead of kstrtol().

Link: http://lkml.kernel.org/r/20170802211450.27928-4-mcgrof@kernel.org
Fixes: 39258f448d71 ("kmod: add test driver to stress test the module loader")
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: David Binderman <dcb314@hotmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michal Marek <mmarek@suse.com>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_kmod.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index 90c91541fc16..8fc0a7a19c83 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -880,10 +880,10 @@ static int test_dev_config_update_uint_sync(struct kmod_test_device *test_dev,
 					    int (*test_sync)(struct kmod_test_device *test_dev))
 {
 	int ret;
-	long new;
+	unsigned long new;
 	unsigned int old_val;
 
-	ret = kstrtol(buf, 10, &new);
+	ret = kstrtoul(buf, 10, &new);
 	if (ret)
 		return ret;
 
@@ -918,9 +918,9 @@ static int test_dev_config_update_uint_range(struct kmod_test_device *test_dev,
 					     unsigned int max)
 {
 	int ret;
-	long new;
+	unsigned long new;
 
-	ret = kstrtol(buf, 10, &new);
+	ret = kstrtoul(buf, 10, &new);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 9c56771316ef50992ada284b4c01b03842b2660d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 10 Aug 2017 15:23:47 -0700
Subject: test_kmod: fix the lock in register_test_dev_kmod()

We accidentally just drop the lock twice instead of taking it and then
releasing it.  This isn't a big issue unless you are adding more than
one device to test on, and the kmod.sh doesn't do that yet, however this
obviously is the correct thing to do.

[mcgrof@kernel.org: massaged subject, explain what happens]
Link: http://lkml.kernel.org/r/20170802211450.27928-5-mcgrof@kernel.org
Fixes: 39258f448d71 ("kmod: add test driver to stress test the module loader")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: David Binderman <dcb314@hotmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michal Marek <mmarek@suse.com>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index 8fc0a7a19c83..1bc06bbfc97a 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -1146,7 +1146,7 @@ static struct kmod_test_device *register_test_dev_kmod(void)
 	struct kmod_test_device *test_dev = NULL;
 	int ret;
 
-	mutex_unlock(&reg_dev_mutex);
+	mutex_lock(&reg_dev_mutex);
 
 	/* int should suffice for number of devices, test for wrap */
 	if (unlikely(num_test_devs + 1) < 0) {
-- 
cgit v1.2.3


From 4e98ebe5f435fbe5bca91c24bc5d5a2b33025e08 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 10 Aug 2017 15:23:50 -0700
Subject: test_kmod: fix small memory leak on filesystem tests

The break was in the wrong place so file system tests don't work as
intended, leaking memory at each test switch.

[mcgrof@kernel.org: massaged commit subject, noted memory leak issue without the fix]
Link: http://lkml.kernel.org/r/20170802211450.27928-6-mcgrof@kernel.org
Fixes: 39258f448d71 ("kmod: add test driver to stress test the module loader")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reported-by: David Binderman <dcb314@hotmail.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michal Marek <mmarek@suse.com>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index 1bc06bbfc97a..ff9148969b92 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -746,11 +746,11 @@ static int trigger_config_run_type(struct kmod_test_device *test_dev,
 						      strlen(test_str));
 		break;
 	case TEST_KMOD_FS_TYPE:
-		break;
 		kfree_const(config->test_fs);
 		config->test_driver = NULL;
 		copied = config_copy_test_fs(config, test_str,
 					     strlen(test_str));
+		break;
 	default:
 		mutex_unlock(&test_dev->config_mutex);
 		return -EINVAL;
-- 
cgit v1.2.3


From 9eeb52ae712e72141c4c1d173048a606ba8f42f6 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Thu, 10 Aug 2017 15:23:53 -0700
Subject: fault-inject: fix wrong should_fail() decision in task context

Commit 1203c8e6fb0a ("fault-inject: simplify access check for fail-nth")
unintentionally broke a conditional statement in should_fail().  Any
faults are not injected in the task context by the change when the
systematic fault injection is not used.

This change restores to the previous correct behaviour.

Link: http://lkml.kernel.org/r/1501633700-3488-1-git-send-email-akinobu.mita@gmail.com
Fixes: 1203c8e6fb0a ("fault-inject: simplify access check for fail-nth")
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Reported-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Tested-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/fault-inject.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 7d315fdb9f13..cf7b129b0b2b 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -110,10 +110,12 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 	if (in_task()) {
 		unsigned int fail_nth = READ_ONCE(current->fail_nth);
 
-		if (fail_nth && !WRITE_ONCE(current->fail_nth, fail_nth - 1))
-			goto fail;
+		if (fail_nth) {
+			if (!WRITE_ONCE(current->fail_nth, fail_nth - 1))
+				goto fail;
 
-		return false;
+			return false;
+		}
 	}
 
 	/* No need to check any other properties if the probability is 0 */
-- 
cgit v1.2.3


From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001
From: Nadav Amit <nadav.amit@gmail.com>
Date: Thu, 10 Aug 2017 15:23:56 -0700
Subject: mm: migrate: prevent racy access to tlb_flush_pending

Patch series "fixes of TLB batching races", v6.

It turns out that Linux TLB batching mechanism suffers from various
races.  Races that are caused due to batching during reclamation were
recently handled by Mel and this patch-set deals with others.  The more
fundamental issue is that concurrent updates of the page-tables allow
for TLB flushes to be batched on one core, while another core changes
the page-tables.  This other core may assume a PTE change does not
require a flush based on the updated PTE value, while it is unaware that
TLB flushes are still pending.

This behavior affects KSM (which may result in memory corruption) and
MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior).  A
proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED.
Memory corruption in KSM is harder to produce in practice, but was
observed by hacking the kernel and adding a delay before flushing and
replacing the KSM page.

Finally, there is also one memory barrier missing, which may affect
architectures with weak memory model.

This patch (of 7):

Setting and clearing mm->tlb_flush_pending can be performed by multiple
threads, since mmap_sem may only be acquired for read in
task_numa_work().  If this happens, tlb_flush_pending might be cleared
while one of the threads still changes PTEs and batches TLB flushes.

This can lead to the same race between migration and
change_protection_range() that led to the introduction of
tlb_flush_pending.  The result of this race was data corruption, which
means that this patch also addresses a theoretically possible data
corruption.

An actual data corruption was not observed, yet the race was was
confirmed by adding assertion to check tlb_flush_pending is not set by
two threads, adding artificial latency in change_protection_range() and
using sysctl to reduce kernel.numa_balancing_scan_delay_ms.

Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com
Fixes: 20841405940e ("mm: fix TLB flush race between migration, and
change_protection_range")
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 31 ++++++++++++++++++++++---------
 kernel/fork.c            |  2 +-
 mm/debug.c               |  2 +-
 mm/mprotect.c            |  4 ++--
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f384bb62d8e..f58f76ee1dfa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -493,7 +493,7 @@ struct mm_struct {
 	 * can move process memory needs to flush the TLB when moving a
 	 * PROT_NONE or PROT_NUMA mapped page.
 	 */
-	bool tlb_flush_pending;
+	atomic_t tlb_flush_pending;
 #endif
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	/* See flush_tlb_batched_pending() */
@@ -532,33 +532,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 {
 	barrier();
-	return mm->tlb_flush_pending;
+	return atomic_read(&mm->tlb_flush_pending) > 0;
 }
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
-	mm->tlb_flush_pending = true;
+	atomic_set(&mm->tlb_flush_pending, 0);
+}
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+	atomic_inc(&mm->tlb_flush_pending);
 
 	/*
-	 * Guarantee that the tlb_flush_pending store does not leak into the
+	 * Guarantee that the tlb_flush_pending increase does not leak into the
 	 * critical section updating the page tables
 	 */
 	smp_mb__before_spinlock();
 }
+
 /* Clearing is done after a TLB flush, which also provides a barrier. */
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
 {
 	barrier();
-	mm->tlb_flush_pending = false;
+	atomic_dec(&mm->tlb_flush_pending);
 }
 #else
 static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 {
 	return false;
 }
-static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
 }
-static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+
+static inline void inc_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+
+static inline void dec_tlb_flush_pending(struct mm_struct *mm)
 {
 }
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..e075b7780421 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 	mmu_notifier_mm_init(mm);
-	clear_tlb_flush_pending(mm);
+	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
 #endif
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..d70103bb4731 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
 #endif
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
-		mm->tlb_flush_pending,
+		atomic_read(&mm->tlb_flush_pending),
 #endif
 		mm->def_flags, &mm->def_flags
 	);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4180ad8cc9c5..bd0f409922cb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -244,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	set_tlb_flush_pending(mm);
+	inc_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -256,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
-	clear_tlb_flush_pending(mm);
+	dec_tlb_flush_pending(mm);
 
 	return pages;
 }
-- 
cgit v1.2.3


From 0a2c40487f3e4215c6ab46e7f837036badfb542b Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Thu, 10 Aug 2017 15:23:59 -0700
Subject: mm: migrate: fix barriers around tlb_flush_pending

Reading tlb_flush_pending while the page-table lock is taken does not
require a barrier, since the lock/unlock already acts as a barrier.
Removing the barrier in mm_tlb_flush_pending() to address this issue.

However, migrate_misplaced_transhuge_page() calls mm_tlb_flush_pending()
while the page-table lock is already released, which may present a
problem on architectures with weak memory model (PPC).  To deal with
this case, a new parameter is added to mm_tlb_flush_pending() to
indicate if it is read without the page-table lock taken, and calling
smp_mb__after_unlock_lock() in this case.

Link: http://lkml.kernel.org/r/20170802000818.4760-3-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f58f76ee1dfa..0e478ebd2706 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -526,12 +526,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 /*
  * Memory barriers to keep this state in sync are graciously provided by
  * the page table locks, outside of which no page table modifications happen.
- * The barriers below prevent the compiler from re-ordering the instructions
- * around the memory barriers that are already present in the code.
+ * The barriers are used to ensure the order between tlb_flush_pending updates,
+ * which happen while the lock is not taken, and the PTE updates, which happen
+ * while the lock is taken, are serialized.
  */
 static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 {
-	barrier();
 	return atomic_read(&mm->tlb_flush_pending) > 0;
 }
 
@@ -554,7 +554,13 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm)
 /* Clearing is done after a TLB flush, which also provides a barrier. */
 static inline void dec_tlb_flush_pending(struct mm_struct *mm)
 {
-	barrier();
+	/*
+	 * Guarantee that the tlb_flush_pending does not not leak into the
+	 * critical section, since we must order the PTE change and changes to
+	 * the pending TLB flush indication. We could have relied on TLB flush
+	 * as a memory barrier, but this behavior is not clearly documented.
+	 */
+	smp_mb__before_atomic();
 	atomic_dec(&mm->tlb_flush_pending);
 }
 #else
-- 
cgit v1.2.3


From a9b802500ebbff1544519a2969323b719dac21f0 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Thu, 10 Aug 2017 15:24:02 -0700
Subject: Revert "mm: numa: defer TLB flush for THP migration as long as
 possible"

While deferring TLB flushes is a good practice, the reverted patch
caused pending TLB flushes to be checked while the page-table lock is
not taken.  As a result, in architectures with weak memory model (PPC),
Linux may miss a memory-barrier, miss the fact TLB flushes are pending,
and cause (in theory) a memory corruption.

Since the alternative of using smp_mb__after_unlock_lock() was
considered a bit open-coded, and the performance impact is expected to
be small, the previous patch is reverted.

This reverts b0943d61b8fa ("mm: numa: defer TLB flush for THP migration
as long as possible").

Link: http://lkml.kernel.org/r/20170802000818.4760-4-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 7 +++++++
 mm/migrate.c     | 6 ------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86975dec0ba1..216114f6ef0b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1495,6 +1495,13 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		goto clear_pmdnuma;
 	}
 
+	/*
+	 * The page_table_lock above provides a memory barrier
+	 * with change_protection_range.
+	 */
+	if (mm_tlb_flush_pending(vma->vm_mm))
+		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and access rights restored.
diff --git a/mm/migrate.c b/mm/migrate.c
index 627671551873..d68a41da6abb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1937,12 +1937,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		put_page(new_page);
 		goto out_fail;
 	}
-	/*
-	 * We are not sure a pending tlb flush here is for a huge page
-	 * mapping or not. Hence use the tlb range variant
-	 */
-	if (mm_tlb_flush_pending(mm))
-		flush_tlb_range(vma, mmun_start, mmun_end);
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);
-- 
cgit v1.2.3


From 56236a59556cfd3bae7bffb7e5f438b5ef0af880 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 10 Aug 2017 15:24:05 -0700
Subject: mm: refactor TLB gathering API

This patch is a preparatory patch for solving race problems caused by
TLB batch.  For that, we will increase/decrease TLB flush pending count
of mm_struct whenever tlb_[gather|finish]_mmu is called.

Before making it simple, this patch separates architecture specific part
and rename it to arch_tlb_[gather|finish]_mmu and generic part just
calls it.

It shouldn't change any behavior.

Link: http://lkml.kernel.org/r/20170802000818.4760-5-namit@vmware.com
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/tlb.h  |  6 ++++--
 arch/ia64/include/asm/tlb.h |  6 ++++--
 arch/s390/include/asm/tlb.h | 12 ++++++------
 arch/sh/include/asm/tlb.h   |  6 ++++--
 arch/um/include/asm/tlb.h   |  8 +++++---
 include/asm-generic/tlb.h   |  7 ++++---
 include/linux/mm_types.h    |  6 ++++++
 mm/memory.c                 | 28 +++++++++++++++++++++-------
 8 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 3f2eb76243e3..7f5b2a2d3861 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -148,7 +148,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->fullmm = !(start | (end+1));
@@ -166,7 +167,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
 }
 
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+			unsigned long start, unsigned long end)
 {
 	tlb_flush_mmu(tlb);
 
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index fced197b9626..93cadc04ac62 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -168,7 +168,8 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
 
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->max = ARRAY_SIZE(tlb->local);
@@ -185,7 +186,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
  * collected.
  */
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+			unsigned long start, unsigned long end)
 {
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 7317b3108a88..d574d0820dc8 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -47,10 +47,9 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
-static inline void tlb_gather_mmu(struct mmu_gather *tlb,
-				  struct mm_struct *mm,
-				  unsigned long start,
-				  unsigned long end)
+static inline void
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -76,8 +75,9 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_flush_mmu_free(tlb);
 }
 
-static inline void tlb_finish_mmu(struct mmu_gather *tlb,
-				  unsigned long start, unsigned long end)
+static inline void
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end)
 {
 	tlb_flush_mmu(tlb);
 }
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 46e0d635e36f..89786560dbd4 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -36,7 +36,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -47,7 +48,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
 }
 
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end)
 {
 	if (tlb->fullmm)
 		flush_tlb_mm(tlb->mm);
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 600a2e9bfee2..2a901eca7145 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -45,7 +45,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -80,12 +81,13 @@ tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_flush_mmu_free(tlb);
 }
 
-/* tlb_finish_mmu
+/* arch_tlb_finish_mmu
  *	Called at the end of the shootdown operation to free up any resources
  *	that were required.
  */
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end)
 {
 	tlb_flush_mmu(tlb);
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 8afa4335e5b2..8f71521e7a44 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -112,10 +112,11 @@ struct mmu_gather {
 
 #define HAVE_GENERIC_MMU_GATHER
 
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end);
+void arch_tlb_gather_mmu(struct mmu_gather *tlb,
+	struct mm_struct *mm, unsigned long start, unsigned long end);
 void tlb_flush_mmu(struct mmu_gather *tlb);
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
-							unsigned long end);
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+			 unsigned long start, unsigned long end);
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
 				   int page_size);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e478ebd2706..c605f2a3a68e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -522,6 +522,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+struct mmu_gather;
+extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+				unsigned long start, unsigned long end);
+extern void tlb_finish_mmu(struct mmu_gather *tlb,
+				unsigned long start, unsigned long end);
+
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 /*
  * Memory barriers to keep this state in sync are graciously provided by
diff --git a/mm/memory.c b/mm/memory.c
index f65beaad319b..34cba5113e06 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -215,12 +215,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
 	return true;
 }
 
-/* tlb_gather_mmu
- *	Called to initialize an (on-stack) mmu_gather structure for page-table
- *	tear-down from @mm. The @fullmm argument is used when @mm is without
- *	users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+				unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 
@@ -275,7 +271,8 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  *	Called at the end of the shootdown operation to free up any resources
  *	that were required.
  */
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end)
 {
 	struct mmu_gather_batch *batch, *next;
 
@@ -398,6 +395,23 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
+/* tlb_gather_mmu
+ *	Called to initialize an (on-stack) mmu_gather structure for page-table
+ *	tear-down from @mm. The @fullmm argument is used when @mm is without
+ *	users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+	arch_tlb_gather_mmu(tlb, mm, start, end);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end)
+{
+	arch_tlb_finish_mmu(tlb, start, end);
+}
+
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
-- 
cgit v1.2.3


From 0a2dd266dd6b7a31503b5bbe63af05961a6b446d Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 10 Aug 2017 15:24:09 -0700
Subject: mm: make tlb_flush_pending global

Currently, tlb_flush_pending is used only for CONFIG_[NUMA_BALANCING|
COMPACTION] but upcoming patches to solve subtle TLB flush batching
problem will use it regardless of compaction/NUMA so this patch doesn't
remove the dependency.

[akpm@linux-foundation.org: remove more ifdefs from world's ugliest printk statement]
Link: http://lkml.kernel.org/r/20170802000818.4760-6-namit@vmware.com
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 21 ---------------------
 mm/debug.c               |  4 ----
 2 files changed, 25 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c605f2a3a68e..892a7b0196fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -487,14 +487,12 @@ struct mm_struct {
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
 #endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 	/*
 	 * An operation with batched TLB flushing is going on. Anything that
 	 * can move process memory needs to flush the TLB when moving a
 	 * PROT_NONE or PROT_NUMA mapped page.
 	 */
 	atomic_t tlb_flush_pending;
-#endif
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	/* See flush_tlb_batched_pending() */
 	bool tlb_flush_batched;
@@ -528,7 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 extern void tlb_finish_mmu(struct mmu_gather *tlb,
 				unsigned long start, unsigned long end);
 
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 /*
  * Memory barriers to keep this state in sync are graciously provided by
  * the page table locks, outside of which no page table modifications happen.
@@ -569,24 +566,6 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm)
 	smp_mb__before_atomic();
 	atomic_dec(&mm->tlb_flush_pending);
 }
-#else
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
-{
-	return false;
-}
-
-static inline void init_tlb_flush_pending(struct mm_struct *mm)
-{
-}
-
-static inline void inc_tlb_flush_pending(struct mm_struct *mm)
-{
-}
-
-static inline void dec_tlb_flush_pending(struct mm_struct *mm)
-{
-}
-#endif
 
 struct vm_fault;
 
diff --git a/mm/debug.c b/mm/debug.c
index d70103bb4731..5715448ab0b5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm)
 #ifdef CONFIG_NUMA_BALANCING
 		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
 #endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		"tlb_flush_pending %d\n"
-#endif
 		"def_flags: %#lx(%pGv)\n",
 
 		mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
@@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm)
 #ifdef CONFIG_NUMA_BALANCING
 		mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
 #endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		atomic_read(&mm->tlb_flush_pending),
-#endif
 		mm->def_flags, &mm->def_flags
 	);
 }
-- 
cgit v1.2.3


From 99baac21e4585f4258f919502c6e23f1e5edc98c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 10 Aug 2017 15:24:12 -0700
Subject: mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem

Nadav reported parallel MADV_DONTNEED on same range has a stale TLB
problem and Mel fixed it[1] and found same problem on MADV_FREE[2].

Quote from Mel Gorman:
 "The race in question is CPU 0 running madv_free and updating some PTEs
  while CPU 1 is also running madv_free and looking at the same PTEs.
  CPU 1 may have writable TLB entries for a page but fail the pte_dirty
  check (because CPU 0 has updated it already) and potentially fail to
  flush.

  Hence, when madv_free on CPU 1 returns, there are still potentially
  writable TLB entries and the underlying PTE is still present so that a
  subsequent write does not necessarily propagate the dirty bit to the
  underlying PTE any more. Reclaim at some unknown time at the future
  may then see that the PTE is still clean and discard the page even
  though a write has happened in the meantime. I think this is possible
  but I could have missed some protection in madv_free that prevents it
  happening."

This patch aims for solving both problems all at once and is ready for
other problem with KSM, MADV_FREE and soft-dirty story[3].

TLB batch API(tlb_[gather|finish]_mmu] uses [inc|dec]_tlb_flush_pending
and mmu_tlb_flush_pending so that when tlb_finish_mmu is called, we can
catch there are parallel threads going on.  In that case, forcefully,
flush TLB to prevent for user to access memory via stale TLB entry
although it fail to gather page table entry.

I confirmed this patch works with [4] test program Nadav gave so this
patch supersedes "mm: Always flush VMA ranges affected by zap_page_range
v2" in current mmotm.

NOTE:

This patch modifies arch-specific TLB gathering interface(x86, ia64,
s390, sh, um).  It seems most of architecture are straightforward but
s390 need to be careful because tlb_flush_mmu works only if
mm->context.flush_mm is set to non-zero which happens only a pte entry
really is cleared by ptep_get_and_clear and friends.  However, this
problem never changes the pte entries but need to flush to prevent
memory access from stale tlb.

[1] http://lkml.kernel.org/r/20170725101230.5v7gvnjmcnkzzql3@techsingularity.net
[2] http://lkml.kernel.org/r/20170725100722.2dxnmgypmwnrfawp@suse.de
[3] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com
[4] https://patchwork.kernel.org/patch/9861621/

[minchan@kernel.org: decrease tlb flush pending count in tlb_finish_mmu]
  Link: http://lkml.kernel.org/r/20170808080821.GA31730@bbox
Link: http://lkml.kernel.org/r/20170802000818.4760-7-namit@vmware.com
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Reported-by: Nadav Amit <namit@vmware.com>
Reported-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/tlb.h  |  7 ++++++-
 arch/ia64/include/asm/tlb.h |  4 +++-
 arch/s390/include/asm/tlb.h |  7 ++++++-
 arch/sh/include/asm/tlb.h   |  4 ++--
 arch/um/include/asm/tlb.h   |  7 ++++++-
 include/asm-generic/tlb.h   |  2 +-
 include/linux/mm_types.h    |  8 ++++++++
 mm/memory.c                 | 18 ++++++++++++++++--
 8 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 7f5b2a2d3861..d5562f9ce600 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -168,8 +168,13 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			unsigned long start, unsigned long end)
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->range_start = start;
+		tlb->range_end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 93cadc04ac62..cbe5ac3699bf 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -187,8 +187,10 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
  */
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			unsigned long start, unsigned long end)
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force)
+		tlb->need_flush = 1;
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index d574d0820dc8..2eb8ff0d6fca 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -77,8 +77,13 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 }
 
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 89786560dbd4..51a8bc967e75 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -49,9 +49,9 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
-	if (tlb->fullmm)
+	if (tlb->fullmm || force)
 		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 2a901eca7145..344d95619d03 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -87,8 +87,13 @@ tlb_flush_mmu(struct mmu_gather *tlb)
  */
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+		tlb->need_flush = 1;
+	}
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 8f71521e7a44..faddde44de8c 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -116,7 +116,7 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb,
 	struct mm_struct *mm, unsigned long start, unsigned long end);
 void tlb_flush_mmu(struct mmu_gather *tlb);
 void arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			 unsigned long start, unsigned long end);
+			 unsigned long start, unsigned long end, bool force);
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
 				   int page_size);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 892a7b0196fd..3cadee0a3508 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -538,6 +538,14 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 	return atomic_read(&mm->tlb_flush_pending) > 0;
 }
 
+/*
+ * Returns true if there are two above TLB batching threads in parallel.
+ */
+static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+{
+	return atomic_read(&mm->tlb_flush_pending) > 1;
+}
+
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
 	atomic_set(&mm->tlb_flush_pending, 0);
diff --git a/mm/memory.c b/mm/memory.c
index 34cba5113e06..e158f7ac6730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,10 +272,13 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  *	that were required.
  */
 void arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
 	struct mmu_gather_batch *batch, *next;
 
+	if (force)
+		__tlb_adjust_range(tlb, start, end - start);
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
@@ -404,12 +407,23 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 			unsigned long start, unsigned long end)
 {
 	arch_tlb_gather_mmu(tlb, mm, start, end);
+	inc_tlb_flush_pending(tlb->mm);
 }
 
 void tlb_finish_mmu(struct mmu_gather *tlb,
 		unsigned long start, unsigned long end)
 {
-	arch_tlb_finish_mmu(tlb, start, end);
+	/*
+	 * If there are parallel threads are doing PTE changes on same range
+	 * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+	 * flush by batching, a thread has stable TLB entry can fail to flush
+	 * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+	 * forcefully if we detect parallel PTE batching threads.
+	 */
+	bool force = mm_tlb_flush_nested(tlb->mm);
+
+	arch_tlb_finish_mmu(tlb, start, end, force);
+	dec_tlb_flush_pending(tlb->mm);
 }
 
 /*
-- 
cgit v1.2.3


From b3a81d0841a954a3d3e782059960f53e63b37066 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Thu, 10 Aug 2017 15:24:15 -0700
Subject: mm: fix KSM data corruption

Nadav reported KSM can corrupt the user data by the TLB batching
race[1].  That means data user written can be lost.

Quote from Nadav Amit:
 "For this race we need 4 CPUs:

  CPU0: Caches a writable and dirty PTE entry, and uses the stale value
  for write later.

  CPU1: Runs madvise_free on the range that includes the PTE. It would
  clear the dirty-bit. It batches TLB flushes.

  CPU2: Writes 4 to /proc/PID/clear_refs , clearing the PTEs soft-dirty.
  We care about the fact that it clears the PTE write-bit, and of
  course, batches TLB flushes.

  CPU3: Runs KSM. Our purpose is to pass the following test in
  write_protect_page():

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)))

  Since it will avoid TLB flush. And we want to do it while the PTE is
  stale. Later, and before replacing the page, we would be able to
  change the page.

  Note that all the operations the CPU1-3 perform canhappen in parallel
  since they only acquire mmap_sem for read.

  We start with two identical pages. Everything below regards the same
  page/PTE.

  CPU0        CPU1        CPU2        CPU3
  ----        ----        ----        ----
  Write the same
  value on page

  [cache PTE as
   dirty in TLB]

              MADV_FREE
              pte_mkclean()

                          4 > clear_refs
                          pte_wrprotect()

                                      write_protect_page()
                                      [ success, no flush ]

                                      pages_indentical()
                                      [ ok ]

  Write to page
  different value

  [Ok, using stale
   PTE]

                                      replace_page()

  Later, CPU1, CPU2 and CPU3 would flush the TLB, but that is too late.
  CPU0 already wrote on the page, but KSM ignored this write, and it got
  lost"

In above scenario, MADV_FREE is fixed by changing TLB batching API
including [set|clear]_tlb_flush_pending.  Remained thing is soft-dirty
part.

This patch changes soft-dirty uses TLB batching API instead of
flush_tlb_mm and KSM checks pending TLB flush by using
mm_tlb_flush_pending so that it will flush TLB to avoid data lost if
there are other parallel threads pending TLB flush.

[1] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com

Link: http://lkml.kernel.org/r/20170802000818.4760-8-namit@vmware.com
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Reported-by: Nadav Amit <namit@vmware.com>
Tested-by: Nadav Amit <namit@vmware.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 7 +++++--
 mm/ksm.c           | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b836fd61ed87..fe8f3265e877 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -16,9 +16,10 @@
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/uaccess.h>
 
 #include <asm/elf.h>
-#include <linux/uaccess.h>
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
@@ -1008,6 +1009,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	enum clear_refs_types type;
+	struct mmu_gather tlb;
 	int itype;
 	int rv;
 
@@ -1054,6 +1056,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		}
 
 		down_read(&mm->mmap_sem);
+		tlb_gather_mmu(&tlb, mm, 0, -1);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
 				if (!(vma->vm_flags & VM_SOFTDIRTY))
@@ -1075,7 +1078,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
 			mmu_notifier_invalidate_range_end(mm, 0, -1);
-		flush_tlb_mm(mm);
+		tlb_finish_mmu(&tlb, 0, -1);
 		up_read(&mm->mmap_sem);
 out_mm:
 		mmput(mm);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4dc92f138786..db20f8436bc3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		goto out_unlock;
 
 	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
-	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
+	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+						mm_tlb_flush_pending(mm)) {
 		pte_t entry;
 
 		swapped = PageSwapCache(page);
-- 
cgit v1.2.3


From c0a6a5ae6b5d976592a83bdafc1c2f49b0718c65 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 10 Aug 2017 15:24:18 -0700
Subject: MAINTAINERS: copy virtio on balloon_compaction.c

Changes to mm/balloon_compaction.c can easily break virtio, and virtio
is the only user of that interface.  Add a line to MAINTAINERS so
whoever changes that file remembers to copy us.

Link: http://lkml.kernel.org/r/1501764010-24456-1-git-send-email-mst@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Acked-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 84d6a8277cbd..6f7721d1634c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14004,6 +14004,7 @@ F:	drivers/block/virtio_blk.c
 F:	include/linux/virtio*.h
 F:	include/uapi/linux/virtio_*.h
 F:	drivers/crypto/virtio/
+F:	mm/balloon_compaction.c
 
 VIRTIO CRYPTO DRIVER
 M:	Gonglei <arei.gonglei@huawei.com>
-- 
cgit v1.2.3


From af54aed94bf3a1cf0b847bbbf00f9a58e278b338 Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Thu, 10 Aug 2017 15:24:21 -0700
Subject: mm/balloon_compaction.c: don't zero ballooned pages

Revert commit bb01b64cfab7 ("mm/balloon_compaction.c: enqueue zero page
to balloon device")'

Zeroing ballon pages is rather time consuming, especially when a lot of
pages are in flight. E.g. 7GB worth of ballooned memory takes 2.8s with
__GFP_ZERO while it takes ~491ms without it.

The original commit argued that zeroing will help ksmd to merge these
pages on the host but this argument is assuming that the host actually
marks balloon pages for ksm which is not universally true.  So we pay
performance penalty for something that even might not be used in the end
which is wrong.  The host can zero out pages on its own when there is a
need.

[mhocko@kernel.org: new changelog text]
Link: http://lkml.kernel.org/r/1501761557-9758-1-git-send-email-wei.w.wang@intel.com
Fixes: bb01b64cfab7 ("mm/balloon_compaction.c: enqueue zero page to balloon device")
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: zhenwei.pi <zhenwei.pi@youruncloud.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/balloon_compaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 9075aa54e955..b06d9fe23a28 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
 {
 	unsigned long flags;
 	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
-				__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
+				       __GFP_NOMEMALLOC | __GFP_NORETRY);
 	if (!page)
 		return NULL;
 
-- 
cgit v1.2.3


From d041353dc98a6339182cd6f628b4c8f111278cb3 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 10 Aug 2017 15:24:24 -0700
Subject: mm: fix list corruptions on shmem shrinklist

We saw many list corruption warnings on shmem shrinklist:

  WARNING: CPU: 18 PID: 177 at lib/list_debug.c:59 __list_del_entry+0x9e/0xc0
  list_del corruption. prev->next should be ffff9ae5694b82d8, but was ffff9ae5699ba960
  Modules linked in: intel_rapl sb_edac edac_core x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul ghash_clmulni_intel raid0 dcdbas shpchp wmi hed i2c_i801 ioatdma lpc_ich i2c_smbus acpi_cpufreq tcp_diag inet_diag sch_fq_codel ipmi_si ipmi_devintf ipmi_msghandler igb ptp crc32c_intel pps_core i2c_algo_bit i2c_core dca ipv6 crc_ccitt
  CPU: 18 PID: 177 Comm: kswapd1 Not tainted 4.9.34-t3.el7.twitter.x86_64 #1
  Hardware name: Dell Inc. PowerEdge C6220/0W6W6G, BIOS 2.2.3 11/07/2013
  Call Trace:
    dump_stack+0x4d/0x66
    __warn+0xcb/0xf0
    warn_slowpath_fmt+0x4f/0x60
    __list_del_entry+0x9e/0xc0
    shmem_unused_huge_shrink+0xfa/0x2e0
    shmem_unused_huge_scan+0x20/0x30
    super_cache_scan+0x193/0x1a0
    shrink_slab.part.41+0x1e3/0x3f0
    shrink_slab+0x29/0x30
    shrink_node+0xf9/0x2f0
    kswapd+0x2d8/0x6c0
    kthread+0xd7/0xf0
    ret_from_fork+0x22/0x30

  WARNING: CPU: 23 PID: 639 at lib/list_debug.c:33 __list_add+0x89/0xb0
  list_add corruption. prev->next should be next (ffff9ae5699ba960), but was ffff9ae5694b82d8. (prev=ffff9ae5694b82d8).
  Modules linked in: intel_rapl sb_edac edac_core x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul ghash_clmulni_intel raid0 dcdbas shpchp wmi hed i2c_i801 ioatdma lpc_ich i2c_smbus acpi_cpufreq tcp_diag inet_diag sch_fq_codel ipmi_si ipmi_devintf ipmi_msghandler igb ptp crc32c_intel pps_core i2c_algo_bit i2c_core dca ipv6 crc_ccitt
  CPU: 23 PID: 639 Comm: systemd-udevd Tainted: G        W       4.9.34-t3.el7.twitter.x86_64 #1
  Hardware name: Dell Inc. PowerEdge C6220/0W6W6G, BIOS 2.2.3 11/07/2013
  Call Trace:
    dump_stack+0x4d/0x66
    __warn+0xcb/0xf0
    warn_slowpath_fmt+0x4f/0x60
    __list_add+0x89/0xb0
    shmem_setattr+0x204/0x230
    notify_change+0x2ef/0x440
    do_truncate+0x5d/0x90
    path_openat+0x331/0x1190
    do_filp_open+0x7e/0xe0
    do_sys_open+0x123/0x200
    SyS_open+0x1e/0x20
    do_syscall_64+0x61/0x170
    entry_SYSCALL64_slow_path+0x25/0x25

The problem is that shmem_unused_huge_shrink() moves entries from the
global sbinfo->shrinklist to its local lists and then releases the
spinlock.  However, a parallel shmem_setattr() could access one of these
entries directly and add it back to the global shrinklist if it is
removed, with the spinlock held.

The logic itself looks solid since an entry could be either in a local
list or the global list, otherwise it is removed from one of them by
list_del_init().  So probably the race condition is that, one CPU is in
the middle of INIT_LIST_HEAD() but the other CPU calls list_empty()
which returns true too early then the following list_add_tail() sees a
corrupted entry.

list_empty_careful() is designed to fix this situation.

[akpm@linux-foundation.org: add comments]
Link: http://lkml.kernel.org/r/20170803054630.18775-1-xiyou.wangcong@gmail.com
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index b0aa6075d164..6540e5982444 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1022,7 +1022,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 			 */
 			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
 				spin_lock(&sbinfo->shrinklist_lock);
-				if (list_empty(&info->shrinklist)) {
+				/*
+				 * _careful to defend against unlocked access to
+				 * ->shrink_list in shmem_unused_huge_shrink()
+				 */
+				if (list_empty_careful(&info->shrinklist)) {
 					list_add_tail(&info->shrinklist,
 							&sbinfo->shrinklist);
 					sbinfo->shrinklist_len++;
@@ -1817,7 +1821,11 @@ alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
 			 * to shrink under memory pressure.
 			 */
 			spin_lock(&sbinfo->shrinklist_lock);
-			if (list_empty(&info->shrinklist)) {
+			/*
+			 * _careful to defend against unlocked access to
+			 * ->shrink_list in shmem_unused_huge_shrink()
+			 */
+			if (list_empty_careful(&info->shrinklist)) {
 				list_add_tail(&info->shrinklist,
 						&sbinfo->shrinklist);
 				sbinfo->shrinklist_len++;
-- 
cgit v1.2.3


From aac2fea94f7a3df8ad1eeb477eb2643f81fd5393 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 10 Aug 2017 15:24:27 -0700
Subject: rmap: do not call mmu_notifier_invalidate_page() under ptl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MMU notifiers can sleep, but in page_mkclean_one() we call
mmu_notifier_invalidate_page() under page table lock.

Let's instead use mmu_notifier_invalidate_range() outside
page_vma_mapped_walk() loop.

[jglisse@redhat.com: try_to_unmap_one() do not call mmu_notifier under ptl]
  Link: http://lkml.kernel.org/r/20170809204333.27485-1-jglisse@redhat.com
Link: http://lkml.kernel.org/r/20170804134928.l4klfcnqatni7vsc@black.fi.intel.com
Fixes: c7ab0d2fdc84 ("mm: convert try_to_unmap_one() to use page_vma_mapped_walk()")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Reported-by: axie <axie@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Writer, Tim" <Tim.Writer@amd.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 52 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..c1286d47aa1f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -888,10 +888,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		.flags = PVMW_SYNC,
 	};
 	int *cleaned = arg;
+	bool invalidation_needed = false;
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		int ret = 0;
-		address = pvmw.address;
 		if (pvmw.pte) {
 			pte_t entry;
 			pte_t *pte = pvmw.pte;
@@ -899,11 +899,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 			if (!pte_dirty(*pte) && !pte_write(*pte))
 				continue;
 
-			flush_cache_page(vma, address, pte_pfn(*pte));
-			entry = ptep_clear_flush(vma, address, pte);
+			flush_cache_page(vma, pvmw.address, pte_pfn(*pte));
+			entry = ptep_clear_flush(vma, pvmw.address, pte);
 			entry = pte_wrprotect(entry);
 			entry = pte_mkclean(entry);
-			set_pte_at(vma->vm_mm, address, pte, entry);
+			set_pte_at(vma->vm_mm, pvmw.address, pte, entry);
 			ret = 1;
 		} else {
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -913,11 +913,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
 				continue;
 
-			flush_cache_page(vma, address, page_to_pfn(page));
-			entry = pmdp_huge_clear_flush(vma, address, pmd);
+			flush_cache_page(vma, pvmw.address, page_to_pfn(page));
+			entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd);
 			entry = pmd_wrprotect(entry);
 			entry = pmd_mkclean(entry);
-			set_pmd_at(vma->vm_mm, address, pmd, entry);
+			set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry);
 			ret = 1;
 #else
 			/* unexpected pmd-mapped page? */
@@ -926,11 +926,16 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		}
 
 		if (ret) {
-			mmu_notifier_invalidate_page(vma->vm_mm, address);
 			(*cleaned)++;
+			invalidation_needed = true;
 		}
 	}
 
+	if (invalidation_needed) {
+		mmu_notifier_invalidate_range(vma->vm_mm, address,
+				address + (1UL << compound_order(page)));
+	}
+
 	return true;
 }
 
@@ -1323,7 +1328,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	};
 	pte_t pteval;
 	struct page *subpage;
-	bool ret = true;
+	bool ret = true, invalidation_needed = false;
 	enum ttu_flags flags = (enum ttu_flags)arg;
 
 	/* munlock has nothing to gain from examining un-locked vmas */
@@ -1363,11 +1368,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(!pvmw.pte, page);
 
 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
-		address = pvmw.address;
-
 
 		if (!(flags & TTU_IGNORE_ACCESS)) {
-			if (ptep_clear_flush_young_notify(vma, address,
+			if (ptep_clear_flush_young_notify(vma, pvmw.address,
 						pvmw.pte)) {
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
@@ -1376,7 +1379,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		}
 
 		/* Nuke the page table entry. */
-		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+		flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte));
 		if (should_defer_flush(mm, flags)) {
 			/*
 			 * We clear the PTE but do not flush so potentially
@@ -1386,11 +1389,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * transition on a cached TLB entry is written through
 			 * and traps if the PTE is unmapped.
 			 */
-			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			pteval = ptep_get_and_clear(mm, pvmw.address,
+						    pvmw.pte);
 
 			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
 		} else {
-			pteval = ptep_clear_flush(vma, address, pvmw.pte);
+			pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
 		}
 
 		/* Move the dirty bit to the page. Now the pte is gone. */
@@ -1405,12 +1409,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			if (PageHuge(page)) {
 				int nr = 1 << compound_order(page);
 				hugetlb_count_sub(nr, mm);
-				set_huge_swap_pte_at(mm, address,
+				set_huge_swap_pte_at(mm, pvmw.address,
 						     pvmw.pte, pteval,
 						     vma_mmu_pagesize(vma));
 			} else {
 				dec_mm_counter(mm, mm_counter(page));
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
 			}
 
 		} else if (pte_unused(pteval)) {
@@ -1434,7 +1438,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			set_pte_at(mm, address, pvmw.pte, swp_pte);
+			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 		} else if (PageAnon(page)) {
 			swp_entry_t entry = { .val = page_private(subpage) };
 			pte_t swp_pte;
@@ -1460,7 +1464,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				 * If the page was redirtied, it cannot be
 				 * discarded. Remap the page to page table.
 				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
 				SetPageSwapBacked(page);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
@@ -1468,7 +1472,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			}
 
 			if (swap_duplicate(entry) < 0) {
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
 				break;
@@ -1484,14 +1488,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			set_pte_at(mm, address, pvmw.pte, swp_pte);
+			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 		} else
 			dec_mm_counter(mm, mm_counter_file(page));
 discard:
 		page_remove_rmap(subpage, PageHuge(page));
 		put_page(page);
-		mmu_notifier_invalidate_page(mm, address);
+		invalidation_needed = true;
 	}
+
+	if (invalidation_needed)
+		mmu_notifier_invalidate_range(mm, address,
+				address + (1UL << compound_order(page)));
 	return ret;
 }
 
-- 
cgit v1.2.3


From f357e345eef7863da037e0243f2d3df4ba6df986 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Thu, 10 Aug 2017 15:24:29 -0700
Subject: zram: rework copy of compressor name in comp_algorithm_store()

comp_algorithm_store() passes the size of the source buffer to strlcpy()
instead of the destination buffer size.  Make it explicit that the two
buffers have the same size and use strcpy() instead of strlcpy().  The
latter can be done safely since the function ensures that the string in
the source buffer is terminated.

Link: http://lkml.kernel.org/r/20170803163350.45245-1-mka@chromium.org
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 856d5dc02451..3b1b6340ba13 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -308,7 +308,7 @@ static ssize_t comp_algorithm_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
 	struct zram *zram = dev_to_zram(dev);
-	char compressor[CRYPTO_MAX_ALG_NAME];
+	char compressor[ARRAY_SIZE(zram->compressor)];
 	size_t sz;
 
 	strlcpy(compressor, buf, sizeof(compressor));
@@ -327,7 +327,7 @@ static ssize_t comp_algorithm_store(struct device *dev,
 		return -EBUSY;
 	}
 
-	strlcpy(zram->compressor, compressor, sizeof(compressor));
+	strcpy(zram->compressor, compressor);
 	up_write(&zram->init_lock);
 	return len;
 }
-- 
cgit v1.2.3


From e86b298bebf7e799e4b7232e9135799f1947552e Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Thu, 10 Aug 2017 15:24:32 -0700
Subject: userfaultfd: replace ENOSPC with ESRCH in case mm has gone during
 copy/zeropage

When the process exit races with outstanding mcopy_atomic, it would be
better to return ESRCH error.  When such race occurs the process and
it's mm are going away and returning "no such process" to the uffd
monitor seems better fit than ENOSPC.

Link: http://lkml.kernel.org/r/1502111545-32305-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 06ea26b8c996..b0d5897bc4e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1600,7 +1600,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 				   uffdio_copy.len);
 		mmput(ctx->mm);
 	} else {
-		return -ENOSPC;
+		return -ESRCH;
 	}
 	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
 		return -EFAULT;
@@ -1647,7 +1647,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 				     uffdio_zeropage.range.len);
 		mmput(ctx->mm);
 	} else {
-		return -ENOSPC;
+		return -ESRCH;
 	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;
-- 
cgit v1.2.3