From 784d5699eddc55878627da20d3fe0c8542e2f1a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 11 Jan 2016 11:04:34 -0500
Subject: x86: move exports to actual definitions

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/include/asm/export.h | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 arch/x86/include/asm/export.h

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h
new file mode 100644
index 000000000000..138de56b13eb
--- /dev/null
+++ b/arch/x86/include/asm/export.h
@@ -0,0 +1,4 @@
+#ifdef CONFIG_64BIT
+#define KSYM_ALIGN 16
+#endif
+#include <asm-generic/export.h>
-- 
cgit v1.2.3


From 8a7e75d47b68193339f8727cf4503271d0a0b1d0 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:22 +1000
Subject: KVM: Add provisioning for ulong vm stats and u64 vcpu stats

vms and vcpus have statistics associated with them which can be viewed
within the debugfs. Currently it is assumed within the vcpu_stat_get() and
vm_stat_get() functions that all of these statistics are represented as
u32s, however the next patch adds some u64 vcpu statistics.

Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly.
Since vcpu statistics are per vcpu, they will only be updated by a single
vcpu at a time so this shouldn't present a problem on 32-bit machines
which can't atomically increment 64-bit numbers. However vm statistics
could potentially be updated by multiple vcpus from that vm at a time.
To avoid the overhead of atomics make all vm statistics ulong such that
they are 64-bit on 64-bit systems where they can be atomically incremented
and are 32-bit on 32-bit systems which may not be able to atomically
increment 64-bit numbers. Modify vm_stat_get() to expect ulongs.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/x86/include/asm/kvm_host.h | 72 ++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..67c8f5268af5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -790,45 +790,45 @@ struct kvm_arch {
 };
 
 struct kvm_vm_stat {
-	u32 mmu_shadow_zapped;
-	u32 mmu_pte_write;
-	u32 mmu_pte_updated;
-	u32 mmu_pde_zapped;
-	u32 mmu_flooded;
-	u32 mmu_recycled;
-	u32 mmu_cache_miss;
-	u32 mmu_unsync;
-	u32 remote_tlb_flush;
-	u32 lpages;
+	ulong mmu_shadow_zapped;
+	ulong mmu_pte_write;
+	ulong mmu_pte_updated;
+	ulong mmu_pde_zapped;
+	ulong mmu_flooded;
+	ulong mmu_recycled;
+	ulong mmu_cache_miss;
+	ulong mmu_unsync;
+	ulong remote_tlb_flush;
+	ulong lpages;
 };
 
 struct kvm_vcpu_stat {
-	u32 pf_fixed;
-	u32 pf_guest;
-	u32 tlb_flush;
-	u32 invlpg;
-
-	u32 exits;
-	u32 io_exits;
-	u32 mmio_exits;
-	u32 signal_exits;
-	u32 irq_window_exits;
-	u32 nmi_window_exits;
-	u32 halt_exits;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 request_irq_exits;
-	u32 irq_exits;
-	u32 host_state_reload;
-	u32 efer_reload;
-	u32 fpu_reload;
-	u32 insn_emulation;
-	u32 insn_emulation_fail;
-	u32 hypercalls;
-	u32 irq_injections;
-	u32 nmi_injections;
+	u64 pf_fixed;
+	u64 pf_guest;
+	u64 tlb_flush;
+	u64 invlpg;
+
+	u64 exits;
+	u64 io_exits;
+	u64 mmio_exits;
+	u64 signal_exits;
+	u64 irq_window_exits;
+	u64 nmi_window_exits;
+	u64 halt_exits;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 request_irq_exits;
+	u64 irq_exits;
+	u64 host_state_reload;
+	u64 efer_reload;
+	u64 fpu_reload;
+	u64 insn_emulation;
+	u64 insn_emulation_fail;
+	u64 hypercalls;
+	u64 irq_injections;
+	u64 nmi_injections;
 };
 
 struct x86_instruction_info;
-- 
cgit v1.2.3


From 5ea11f2b31b83bd3c05357e6bec20f598e00705a Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Tue, 23 Aug 2016 13:52:41 -0500
Subject: svm: Introduces AVIC per-VM ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces per-VM AVIC ID and helper functions to manage the IDs.
Currently, the ID will be used to implement 32-bit AVIC IOMMU GA tag.

The ID is 24-bit one-based indexing value, and is managed via helper
functions to get the next ID, or to free an ID once a VM is destroyed.
There should be no ID conflict for any active VMs.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..d36176bb1b6d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,6 +781,7 @@ struct kvm_arch {
 	bool disabled_lapic_found;
 
 	/* Struct members for AVIC */
+	u32 avic_vm_id;
 	u32 ldr_mode;
 	struct page *avic_logical_id_table_page;
 	struct page *avic_physical_id_table_page;
-- 
cgit v1.2.3


From 5881f73757cc3dbada878e67c119a801ed0f9a07 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Tue, 23 Aug 2016 13:52:42 -0500
Subject: svm: Introduce AMD IOMMU avic_ga_log_notifier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces avic_ga_log_notifier, which will be called
by IOMMU driver whenever it handles the Guest vAPIC (GA) log entry.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d36176bb1b6d..4c738c206be3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -785,6 +785,7 @@ struct kvm_arch {
 	u32 ldr_mode;
 	struct page *avic_logical_id_table_page;
 	struct page *avic_physical_id_table_page;
+	struct hlist_node hnode;
 
 	bool x2apic_format;
 	bool x2apic_broadcast_quirk_disabled;
-- 
cgit v1.2.3


From 7d06d9c9bd813fc956b9c7bffc1b9724009983eb Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 29 Jul 2016 09:30:12 -0700
Subject: mm: Implement new pkey_mprotect() system call

pkey_mprotect() is just like mprotect, except it also takes a
protection key as an argument.  On systems that do not support
protection keys, it still works, but requires that key=0.
Otherwise it does exactly what mprotect does.

I expect it to get used like this, if you want to guarantee that
any mapping you create can *never* be accessed without the right
protection keys set up.

	int real_prot = PROT_READ|PROT_WRITE;
	pkey = pkey_alloc(0, PKEY_DENY_ACCESS);
	ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);

This way, there is *no* window where the mapping is accessible
since it was always either PROT_NONE or had a protection key set
that denied all access.

We settled on 'unsigned long' for the type of the key here.  We
only need 4 bits on x86 today, but I figured that other
architectures might need some more space.

Semantically, we have a bit of a problem if we combine this
syscall with our previously-introduced execute-only support:
What do we do when we mix execute-only pkey use with
pkey_mprotect() use?  For instance:

	pkey_mprotect(ptr, PAGE_SIZE, PROT_WRITE, 6); // set pkey=6
	mprotect(ptr, PAGE_SIZE, PROT_EXEC);  // set pkey=X_ONLY_PKEY?
	mprotect(ptr, PAGE_SIZE, PROT_WRITE); // is pkey=6 again?

To solve that, we make the plain-mprotect()-initiated execute-only
support only apply to VMAs that have the default protection key (0)
set on them.

Proposed semantics:
1. protection key 0 is special and represents the default,
   "unassigned" protection key.  It is always allocated.
2. mprotect() never affects a mapping's pkey_mprotect()-assigned
   protection key. A protection key of 0 (even if set explicitly)
   represents an unassigned protection key.
   2a. mprotect(PROT_EXEC) on a mapping with an assigned protection
       key may or may not result in a mapping with execute-only
       properties.  pkey_mprotect() plus pkey_set() on all threads
       should be used to _guarantee_ execute-only semantics if this
       is not a strong enough semantic.
3. mprotect(PROT_EXEC) may result in an "execute-only" mapping. The
   kernel will internally attempt to allocate and dedicate a
   protection key for the purpose of execute-only mappings.  This
   may not be possible in cases where there are no free protection
   keys available.  It can also happen, of course, in situations
   where there is no hardware support for protection keys.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163012.3DDD36C4@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/mmu_context.h | 15 ++++++++++-----
 arch/x86/include/asm/pkeys.h       | 11 +++++++++--
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d8abfcf524d1..af0251fc85ed 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -4,6 +4,7 @@
 #include <asm/desc.h>
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
+#include <linux/pkeys.h>
 
 #include <trace/events/tlb.h>
 
@@ -195,16 +196,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		mpx_notify_unmap(mm, vma, start, end);
 }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
-	u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
 				      VM_PKEY_BIT2 | VM_PKEY_BIT3;
-	pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
-#endif
-	return pkey;
+
+	return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+#else
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	return 0;
 }
+#endif
 
 static inline bool __pkru_allows_pkey(u16 pkey, bool write)
 {
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 7b84565c916c..33777c291a85 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -1,7 +1,12 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
-#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+#define PKEY_DEDICATED_EXECUTE_ONLY 15
+/*
+ * Consider the PKEY_DEDICATED_EXECUTE_ONLY key unavailable.
+ */
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? \
+		PKEY_DEDICATED_EXECUTE_ONLY : 1)
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
@@ -10,7 +15,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * Try to dedicate one of the protection keys to be used as an
  * execute-only protection key.
  */
-#define PKEY_DEDICATED_EXECUTE_ONLY 15
 extern int __execute_only_pkey(struct mm_struct *mm);
 static inline int execute_only_pkey(struct mm_struct *mm)
 {
@@ -31,4 +35,7 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
 	return __arch_override_mprotect_pkey(vma, prot, pkey);
 }
 
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
 #endif /*_ASM_X86_PKEYS_H */
-- 
cgit v1.2.3


From a8502b67d739c1d7a4542c1da0a5d98a6a58c177 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 29 Jul 2016 09:30:13 -0700
Subject: x86/pkeys: Make mprotect_key() mask off additional vm_flags

Today, mprotect() takes 4 bits of data: PROT_READ/WRITE/EXEC/NONE.
Three of those bits: READ/WRITE/EXEC get translated directly in to
vma->vm_flags by calc_vm_prot_bits().  If a bit is unset in
mprotect()'s 'prot' argument then it must be cleared in vma->vm_flags
during the mprotect() call.

We do this clearing today by first calculating the VMA flags we
want set, then clearing the ones we do not want to inherit from
the original VMA:

	vm_flags = calc_vm_prot_bits(prot, key);
	...
	newflags = vm_flags;
	newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));

However, we *also* want to mask off the original VMA's vm_flags in
which we store the protection key.

To do that, this patch adds a new macro:

	ARCH_VM_PKEY_FLAGS

which allows the architecture to specify additional bits that it would
like cleared.  We use that to ensure that the VM_PKEY_BIT* bits get
cleared.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163013.E48D6981@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/pkeys.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 33777c291a85..666ffc862ef7 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -38,4 +38,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
 
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
+
 #endif /*_ASM_X86_PKEYS_H */
-- 
cgit v1.2.3


From e8c24d3a23a469f1f40d4de24d872ca7023ced0a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 29 Jul 2016 09:30:15 -0700
Subject: x86/pkeys: Allocation/free syscalls

This patch adds two new system calls:

	int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
	int pkey_free(int pkey);

These implement an "allocator" for the protection keys
themselves, which can be thought of as analogous to the allocator
that the kernel has for file descriptors.  The kernel tracks
which numbers are in use, and only allows operations on keys that
are valid.  A key which was not obtained by pkey_alloc() may not,
for instance, be passed to pkey_mprotect().

These system calls are also very important given the kernel's use
of pkeys to implement execute-only support.  These help ensure
that userspace can never assume that it has control of a key
unless it first asks the kernel.  The kernel does not promise to
preserve PKRU (right register) contents except for allocated
pkeys.

The 'init_access_rights' argument to pkey_alloc() specifies the
rights that will be established for the returned pkey.  For
instance:

	pkey = pkey_alloc(flags, PKEY_DENY_WRITE);

will allocate 'pkey', but also sets the bits in PKRU[1] such that
writing to 'pkey' is already denied.

The kernel does not prevent pkey_free() from successfully freeing
in-use pkeys (those still assigned to a memory range by
pkey_mprotect()).  It would be expensive to implement the checks
for this, so we instead say, "Just don't do it" since sane
software will never do it anyway.

Any piece of userspace calling pkey_alloc() needs to be prepared
for it to fail.  Why?  pkey_alloc() returns the same error code
(ENOSPC) when there are no pkeys and when pkeys are unsupported.
They can be unsupported for a whole host of reasons, so apps must
be prepared for this.  Also, libraries or LD_PRELOADs might steal
keys before an application gets access to them.

This allocation mechanism could be implemented in userspace.
Even if we did it in userspace, we would still need additional
user/kernel interfaces to tell userspace which keys are being
used by the kernel internally (such as for execute-only
mappings).  Having the kernel provide this facility completely
removes the need for these additional interfaces, or having an
implementation of this in userspace at all.

Note that we have to make changes to all of the architectures
that do not use mman-common.h because we use the new
PKEY_DENY_ACCESS/WRITE macros in arch-independent code.

1. PKRU is the Protection Key Rights User register.  It is a
   usermode-accessible register that controls whether writes
   and/or access to each individual pkey is allowed or denied.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/mmu.h         |  8 +++++
 arch/x86/include/asm/mmu_context.h | 10 +++++-
 arch/x86/include/asm/pkeys.h       | 73 ++++++++++++++++++++++++++++++++++----
 3 files changed, 84 insertions(+), 7 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 1ea0baef1175..72198c64e646 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -23,6 +23,14 @@ typedef struct {
 	const struct vdso_image *vdso_image;	/* vdso image in use */
 
 	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	/*
+	 * One bit per protection key says whether userspace can
+	 * use it or not.  protected by mmap_sem.
+	 */
+	u16 pkey_allocation_map;
+	s16 execute_only_pkey;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index af0251fc85ed..8e0a9fe86de4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -108,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+		/* pkey 0 is the default and always allocated */
+		mm->context.pkey_allocation_map = 0x1;
+		/* -1 means unallocated or invalid */
+		mm->context.execute_only_pkey = -1;
+	}
+	#endif
 	init_new_context_ldt(tsk, mm);
+
 	return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
@@ -263,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
 {
 	return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
 }
-
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 666ffc862ef7..b406889de0db 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -1,12 +1,7 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
-#define PKEY_DEDICATED_EXECUTE_ONLY 15
-/*
- * Consider the PKEY_DEDICATED_EXECUTE_ONLY key unavailable.
- */
-#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? \
-		PKEY_DEDICATED_EXECUTE_ONLY : 1)
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
@@ -40,4 +35,70 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 
 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
 
+#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do {		\
+	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do {			\
+	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+	return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 4-bit key on success, or -1 on failure.
+ */
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+	/*
+	 * Note: this is the one and only place we make sure
+	 * that the pkey is valid as far as the hardware is
+	 * concerned.  The rest of the kernel trusts that
+	 * only good, valid pkeys come out of here.
+	 */
+	u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+	int ret;
+
+	/*
+	 * Are we out of pkeys?  We must handle this specially
+	 * because ffz() behavior is undefined if there are no
+	 * zeros.
+	 */
+	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+		return -1;
+
+	ret = ffz(mm_pkey_allocation_map(mm));
+
+	mm_set_pkey_allocated(mm, ret);
+
+	return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+	/*
+	 * pkey 0 is special, always allocated and can never
+	 * be freed.
+	 */
+	if (!pkey)
+		return -EINVAL;
+	if (!mm_pkey_is_allocated(mm, pkey))
+		return -EINVAL;
+
+	mm_set_pkey_free(mm, pkey);
+
+	return 0;
+}
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
 #endif /*_ASM_X86_PKEYS_H */
-- 
cgit v1.2.3


From acd547b29880800d29222c4632d2c145e401988c Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 29 Jul 2016 09:30:21 -0700
Subject: x86/pkeys: Default to a restrictive init PKRU

PKRU is the register that lets you disallow writes or all access to a given
protection key.

The XSAVE hardware defines an "init state" of 0 for PKRU: its most
permissive state, allowing access/writes to everything.  Since we start off
all new processes with the init state, we start all processes off with the
most permissive possible PKRU.

This is unfortunate.  If a thread is clone()'d [1] before a program has
time to set PKRU to a restrictive value, that thread will be able to write
to all data, no matter what pkey is set on it.  This weakens any integrity
guarantees that we want pkeys to provide.

To fix this, we define a very restrictive PKRU to override the
XSAVE-provided value when we create a new FPU context.  We choose a value
that only allows access to pkey 0, which is as restrictive as we can
practically make it.

This does not cause any practical problems with applications using
protection keys because we require them to specify initial permissions for
each key when it is allocated, which override the restrictive default.

In the end, this ensures that threads which do not know how to manage their
own pkey rights can not do damage to data which is pkey-protected.

I would have thought this was a pretty contrived scenario, except that I
heard a bug report from an MPX user who was creating threads in some very
early code before main().  It may be crazy, but folks evidently _do_ it.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: mgorman@techsingularity.net
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163021.F3C25D4A@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/pkeys.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index b406889de0db..34684adb6899 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -100,5 +100,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
+extern void copy_init_pkru_to_fpregs(void);
 
 #endif /*_ASM_X86_PKEYS_H */
-- 
cgit v1.2.3


From a545ab6a0085e6df9c7b6e9734b40ba4d2aca8c9 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Wed, 7 Sep 2016 14:47:19 -0400
Subject: kvm: x86: add tsc_offset field to struct kvm_vcpu_arch

A future commit will want to easily read a vCPU's TSC offset,
so we store it in struct kvm_arch_vcpu_arch for easy access.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cd82bf74e7a5..ddffcfbe155c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
 		struct kvm_steal_time steal;
 	} st;
 
+	u64 tsc_offset;
 	u64 last_guest_tsc;
 	u64 last_host_tsc;
 	u64 tsc_offset_adjustment;
-- 
cgit v1.2.3


From 3e3f50262eb441d0fd1de4dce06739e9c0fe7c61 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Wed, 7 Sep 2016 14:47:20 -0400
Subject: kvm: x86: drop read_tsc_offset()

The TSC offset can now be read directly from struct kvm_arch_vcpu.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ddffcfbe155c..32a43a25d415 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -954,7 +954,6 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
-- 
cgit v1.2.3


From 108b249c453dd7132599ab6dc7e435a7036c193f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 1 Sep 2016 14:21:03 +0200
Subject: KVM: x86: introduce get_kvmclock_ns

Introduce a function that reads the exact nanoseconds value that is
provided to the guest in kvmclock.  This crystallizes the notion of
kvmclock as a thin veneer over a stable TSC, that the guest will
(hopefully) convert with NTP.  In other words, kvmclock is *not* a
paravirtualized host-to-guest NTP.

Drop the get_kernel_ns() function, that was used both to get the base
value of the master clock and to get the current value of kvmclock.
The former use is replaced by ktime_get_boot_ns(), the latter is
the purpose of get_kernel_ns().

This also allows KVM to provide a Hyper-V time reference counter that
is synchronized with the time that is computed from the TSC page.

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/pvclock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			      u64 tsc)
 {
-	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+	u64 delta = tsc - src->tsc_timestamp;
 	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
 					     src->tsc_shift);
 	return src->system_time + offset;
-- 
cgit v1.2.3


From 095cf55df715d14d5dad75326faf5984e7fc0b3a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 8 Feb 2016 12:54:12 +0100
Subject: KVM: x86: Hyper-V tsc page setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lately tsc page was implemented but filled with empty
values. This patch setup tsc page scale and offset based
on vcpu tsc, tsc_khz and  HV_X64_MSR_TIME_REF_COUNT value.

The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr
reads count to zero which potentially improves performance.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Roman Kagan <rkagan@virtuozzo.com>
CC: Denis V. Lunev <den@openvz.org>
[Computation of TSC page parameters rewritten to use the Linux timekeeper
 parameters. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32a43a25d415..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -702,6 +702,8 @@ struct kvm_hv {
 	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
 	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
 	u64 hv_crash_ctl;
+
+	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
-- 
cgit v1.2.3


From 799bc3c51b2b120ca6e59e702ede23fff2efaf43 Mon Sep 17 00:00:00 2001
From: Lance Richardson <lrichard@redhat.com>
Date: Thu, 22 Sep 2016 10:03:57 -0400
Subject: percpu: eliminate two sparse warnings

Fix two cases where a __percpu pointer cast drops __percpu.

Signed-off-by: Lance Richardson <lrichard@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/percpu.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e02e3f80d363..84f58de08c2b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -521,7 +521,8 @@ do {									\
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
                         const unsigned long __percpu *addr)
 {
-	unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+	unsigned long __percpu *a =
+		(unsigned long __percpu *)addr + nr / BITS_PER_LONG;
 
 #ifdef CONFIG_X86_64
 	return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
 	asm volatile("bt "__percpu_arg(2)",%1\n\t"
 			CC_SET(c)
 			: CC_OUT(c) (oldbit)
-			: "m" (*(unsigned long *)addr), "Ir" (nr));
+			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
 
 	return oldbit;
 }
-- 
cgit v1.2.3


From 3161832d58c7f3bf8b190a2887086be0932d8dd3 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 13 Sep 2016 09:05:40 -0600
Subject: x86/PCI: VMD: Request userspace control of PCIe hotplug indicators

Add set_dev_domain_options() to set PCI domain-specific options as devices
are added.  The first usage is to request exclusive userspace control of
PCIe hotplug indicators in VMD domains.

Devices in a VMD domain use PCIe hotplug Attention and Power Indicators in
a non-standard way; tell pciehp to ignore the indicators so userspace can
control them via the sysfs "attention" file.

To determine whether a bus is within a VMD domain, add a bool to the
pci_sysdata structure that the VMD driver sets during initialization.

[bhelgaas: changelog]
Requested-by: Kapil Karkra <kapil.karkra@intel.com>
Tested-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/include/asm/pci.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 9ab7507ca1c2..1411dbed5e5e 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -23,6 +23,9 @@ struct pci_sysdata {
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
 	void		*fwnode;	/* IRQ domain for MSI assignment */
 #endif
+#if IS_ENABLED(CONFIG_VMD)
+	bool vmd_domain;		/* True if in Intel VMD domain */
+#endif
 };
 
 extern int pci_routeirq;
@@ -56,6 +59,17 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
 #define pci_root_bus_fwnode	_pci_root_bus_fwnode
 #endif
 
+static inline bool is_vmd(struct pci_bus *bus)
+{
+#if IS_ENABLED(CONFIG_VMD)
+	struct pci_sysdata *sd = bus->sysdata;
+
+	return sd->vmd_domain;
+#else
+	return false;
+#endif
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
-- 
cgit v1.2.3


From b79d8d82bb994fe6154ea1a8e56d587ef9e356ae Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 5 Sep 2016 11:27:55 -0400
Subject: remove stray include of asm/uaccess.h from cacheflush.h

It was introduced in "arch, x86: pmem api for ensuring durability
of persistent memory updates" in July 2015, along with the code
that really used that stuff.  Three weeks later all that code
got moved to asm/pmem.h by "pmem, x86: move x86 PMEM API to new
pmem.h header"; include, however, was left behind.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/include/asm/cacheflush.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 61518cf79437..872877d930de 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,7 +4,6 @@
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
-#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
-- 
cgit v1.2.3


From 45caf470077ae6b02da6d5eaee94003ee1543ca3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 5 Sep 2016 11:32:44 -0400
Subject: x86: separate extable.h, switch sections.h to it

drivers/platform/x86/dell-smo8800.c is touched due to the following obscenity:
drivers/platform/x86/dell-smo8800.c ->
	linux/interrupt.h ->
		linux/hardirq.h ->
			asm/hardirq.h ->
				linux/irq.h ->
					asm/hw_irq.h ->
						asm/sections.h ->
							asm/uaccess.h
is the only chain of includes pulling asm/uaccess.h there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/include/asm/extable.h  | 35 +++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/sections.h |  2 +-
 arch/x86/include/asm/uaccess.h  | 32 +-------------------------------
 3 files changed, 37 insertions(+), 32 deletions(-)
 create mode 100644 arch/x86/include/asm/extable.h

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 000000000000..b8ad261d11dc
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+/*
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+	int insn, fixup, handler;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta)			\
+	do {							\
+		(a)->fixup = (b)->fixup + (delta);		\
+		(b)->fixup = (tmp).fixup - (delta);		\
+		(a)->handler = (b)->handler + (delta);		\
+		(b)->handler = (tmp).handler - (delta);		\
+	} while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#endif
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 13b6cdd0af57..2f75f30cb2f6 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_SECTIONS_H
 
 #include <asm-generic/sections.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2131c4ce7d8a..faf3687f1035 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -11,6 +11,7 @@
 #include <asm/asm.h>
 #include <asm/page.h>
 #include <asm/smap.h>
+#include <asm/extable.h>
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
@@ -90,37 +91,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 #define access_ok(type, addr, size) \
 	likely(!__range_not_ok(addr, size, user_addr_max()))
 
-/*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-	int insn, fixup, handler;
-};
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-#define swap_ex_entry_fixup(a, b, tmp, delta)			\
-	do {							\
-		(a)->fixup = (b)->fixup + (delta);		\
-		(b)->fixup = (tmp).fixup - (delta);		\
-		(a)->handler = (b)->handler + (delta);		\
-		(b)->handler = (tmp).handler - (delta);		\
-	} while (0)
-
-extern int fixup_exception(struct pt_regs *regs, int trapnr);
-extern bool ex_has_fault_handler(unsigned long ip);
-extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
-
 /*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
-- 
cgit v1.2.3


From 72a9b186292d98494f222226cfd24a1621796209 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Fri, 26 Aug 2016 23:55:36 +0200
Subject: xen: Remove event channel notification through Xen PCI platform
 device

Ever since commit 254d1a3f02eb ("xen/pv-on-hvm kexec: shutdown watches
from old kernel") using the INTx interrupt from Xen PCI platform
device for event channel notification would just lockup the guest
during bootup.  postcore_initcall now calls xs_reset_watches which
will eventually try to read a value from XenStore and will get stuck
on read_reply at XenBus forever since the platform driver is not
probed yet and its INTx interrupt handler is not registered yet. That
means that the guest can not be notified at this moment of any pending
event channels and none of the per-event handlers will ever be invoked
(including the XenStore one) and the reply will never be picked up by
the kernel.

The exact stack where things get stuck during xenbus_init:

-xenbus_init
 -xs_init
  -xs_reset_watches
   -xenbus_scanf
    -xenbus_read
     -xs_single
      -xs_single
       -xs_talkv

Vector callbacks have always been the favourite event notification
mechanism since their introduction in commit 38e20b07efd5 ("x86/xen:
event channels delivery on HVM.") and the vector callback feature has
always been advertised for quite some time by Xen that's why INTx was
broken for several years now without impacting anyone.

Luckily this also means that event channel notification through INTx
is basically dead-code which can be safely removed without impacting
anybody since it has been effectively disabled for more than 4 years
with nobody complaining about it (at least as far as I'm aware of).

This commit removes event channel notification through Xen PCI
platform device.

Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Julien Grall <julien.grall@citrix.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Ross Lagerwall <ross.lagerwall@citrix.com>
Cc: xen-devel@lists.xenproject.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: Anthony Liguori <aliguori@amazon.com>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/include/asm/xen/events.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index e6911caf5bbf..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,15 +20,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
-extern int xen_have_vector_callback;
-
-/*
- * Events delivered via platform PCI interrupts are always
- * routed to vcpu 0 and hence cannot be rebound.
- */
-static inline bool xen_support_evtchn_rebind(void)
-{
-	return (!xen_hvm_domain() || xen_have_vector_callback);
-}
-
 #endif /* _ASM_X86_XEN_EVENTS_H */
-- 
cgit v1.2.3


From 08ea8c07fb56d6eb8194d8ad408b469544bf2c29 Mon Sep 17 00:00:00 2001
From: Baoyou Xie <baoyou.xie@linaro.org>
Date: Fri, 7 Oct 2016 17:00:55 -0700
Subject: mm: move phys_mem_access_prot_allowed() declaration to pgtable.h

We get 1 warning when building kernel with W=1:

  drivers/char/mem.c:220:12: warning: no previous prototype for 'phys_mem_access_prot_allowed' [-Wmissing-prototypes]
   int __weak phys_mem_access_prot_allowed(struct file *file,

In fact, its declaration is spreading to several header files in
different architecture, but need to be declare in common header file.

So this patch moves phys_mem_access_prot_allowed() to pgtable.h.

Link: http://lkml.kernel.org/r/1473751597-12139-1-git-send-email-baoyou.xie@linaro.org
Signed-off-by: Baoyou Xie <baoyou.xie@linaro.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable_types.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f1218f512f62..8b4de22d6429 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot);
 struct file;
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                               unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
-                              unsigned long size, pgprot_t *vma_prot);
 
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
-- 
cgit v1.2.3


From 9a01c3ed5cdb35d9004eb92510ee6ea11b4a5f16 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@mellanox.com>
Date: Fri, 7 Oct 2016 17:02:45 -0700
Subject: nmi_backtrace: add more trigger_*_cpu_backtrace() methods

Patch series "improvements to the nmi_backtrace code" v9.

This patch series modifies the trigger_xxx_backtrace() NMI-based remote
backtracing code to make it more flexible, and makes a few small
improvements along the way.

The motivation comes from the task isolation code, where there are
scenarios where we want to be able to diagnose a case where some cpu is
about to interrupt a task-isolated cpu.  It can be helpful to see both
where the interrupting cpu is, and also an approximation of where the
cpu that is being interrupted is.  The nmi_backtrace framework allows us
to discover the stack of the interrupted cpu.

I've tested that the change works as desired on tile, and build-tested
x86, arm, mips, and sparc64.  For x86 I confirmed that the generic
cpuidle stuff as well as the architecture-specific routines are in the
new cpuidle section.  For arm, mips, and sparc I just build-tested it
and made sure the generic cpuidle routines were in the new cpuidle
section, but I didn't attempt to figure out which the platform-specific
idle routines might be.  That might be more usefully done by someone
with platform experience in follow-up patches.

This patch (of 4):

Currently you can only request a backtrace of either all cpus, or all
cpus but yourself.  It can also be helpful to request a remote backtrace
of a single cpu, and since we want that, the logical extension is to
support a cpumask as the underlying primitive.

This change modifies the existing lib/nmi_backtrace.c code to take a
cpumask as its basic primitive, and modifies the linux/nmi.h code to use
the new "cpumask" method instead.

The existing clients of nmi_backtrace (arm and x86) are converted to
using the new cpumask approach in this change.

The other users of the backtracing API (sparc64 and mips) are converted
to use the cpumask approach rather than the all/allbutself approach.
The mips code ignored the "include_self" boolean but with this change it
will now also dump a local backtrace if requested.

Link: http://lkml.kernel.org/r/1472487169-14923-2-git-send-email-cmetcalf@mellanox.com
Signed-off-by: Chris Metcalf <cmetcalf@mellanox.com>
Tested-by: Daniel Thompson <daniel.thompson@linaro.org> [arm]
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/irq.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index e7de5c9a4fbd..16d3fa211962 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+				    bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
 #endif /* _ASM_X86_IRQ_H */
-- 
cgit v1.2.3


From 6727ad9e206cc08b80d8000a4d67f8417e53539d Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@mellanox.com>
Date: Fri, 7 Oct 2016 17:02:55 -0700
Subject: nmi_backtrace: generate one-line reports for idle cpus

When doing an nmi backtrace of many cores, most of which are idle, the
output is a little overwhelming and very uninformative.  Suppress
messages for cpus that are idling when they are interrupted and just
emit one line, "NMI backtrace for N skipped: idling at pc 0xNNN".

We do this by grouping all the cpuidle code together into a new
.cpuidle.text section, and then checking the address of the interrupted
PC to see if it lies within that section.

This commit suitably tags x86 and tile idle routines, and only adds in
the minimal framework for other architectures.

Link: http://lkml.kernel.org/r/1472487169-14923-5-git-send-email-cmetcalf@mellanox.com
Signed-off-by: Chris Metcalf <cmetcalf@mellanox.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Daniel Thompson <daniel.thompson@linaro.org> [arm]
Tested-by: Petr Mladek <pmladek@suse.com>
Cc: Aaron Tomlin <atomlin@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/irqflags.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b77f5edb03b0..ac7692dcfa2e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -4,6 +4,10 @@
 #include <asm/processor-flags.h>
 
 #ifndef __ASSEMBLY__
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
 /*
  * Interrupt control:
  */
@@ -44,12 +48,12 @@ static inline void native_irq_enable(void)
 	asm volatile("sti": : :"memory");
 }
 
-static inline void native_safe_halt(void)
+static inline __cpuidle void native_safe_halt(void)
 {
 	asm volatile("sti; hlt": : :"memory");
 }
 
-static inline void native_halt(void)
+static inline __cpuidle void native_halt(void)
 {
 	asm volatile("hlt": : :"memory");
 }
@@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void)
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline void arch_safe_halt(void)
+static inline __cpuidle void arch_safe_halt(void)
 {
 	native_safe_halt();
 }
@@ -95,7 +99,7 @@ static inline void arch_safe_halt(void)
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline void halt(void)
+static inline __cpuidle void halt(void)
 {
 	native_halt();
 }
-- 
cgit v1.2.3


From 0ee59413c967c35a6dd2dbdab605b4cd42025ee5 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Tue, 11 Oct 2016 13:54:23 -0700
Subject: x86/panic: replace smp_send_stop() with kdump friendly version in
 panic path

Daniel Walker reported problems which happens when
crash_kexec_post_notifiers kernel option is enabled
(https://lkml.org/lkml/2015/6/24/44).

In that case, smp_send_stop() is called before entering kdump routines
which assume other CPUs are still online.  As the result, for x86, kdump
routines fail to save other CPUs' registers and disable virtualization
extensions.

To fix this problem, call a new kdump friendly function,
crash_smp_send_stop(), instead of the smp_send_stop() when
crash_kexec_post_notifiers is enabled.  crash_smp_send_stop() is a weak
function, and it just call smp_send_stop().  Architecture codes should
override it so that kdump can work appropriately.  This patch only
provides x86-specific version.

For Xen's PV kernel, just keep the current behavior.

NOTES:

- Right solution would be to place crash_smp_send_stop() before
  __crash_kexec() invocation in all cases and remove smp_send_stop(), but
  we can't do that until all architectures implement own
  crash_smp_send_stop()

- crash_smp_send_stop()-like work is still needed by
  machine_crash_shutdown() because crash_kexec() can be called without
  entering panic()

Fixes: f06e5153f4ae (kernel/panic.c: add "crash_kexec_post_notifiers" option)
Link: http://lkml.kernel.org/r/20160810080948.11028.15344.stgit@sysi4-13.yrl.intra.hitachi.co.jp
Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Reported-by: Daniel Walker <dwalker@fifo99.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Daniel Walker <dwalker@fifo99.com>
Cc: Xunlei Pang <xpang@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Daney <david.daney@cavium.com>
Cc: Aaro Koskinen <aaro.koskinen@iki.fi>
Cc: "Steven J. Hill" <steven.hill@cavium.com>
Cc: Corey Minyard <cminyard@mvista.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/kexec.h | 1 +
 arch/x86/include/asm/smp.h   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index d2434c1cad05..282630e4c6ea 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -210,6 +210,7 @@ struct kexec_entry64_regs {
 
 typedef void crash_vmclear_fn(void);
 extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+extern void kdump_nmi_shootdown_cpus(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19980b36f394..026ea82ecc60 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -47,6 +47,7 @@ struct smp_ops {
 	void (*smp_cpus_done)(unsigned max_cpus);
 
 	void (*stop_other_cpus)(int wait);
+	void (*crash_stop_other_cpus)(void);
 	void (*smp_send_reschedule)(int cpu);
 
 	int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
-- 
cgit v1.2.3