summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/sgx.h (renamed from arch/x86/kernel/cpu/sgx/arch.h)50
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c71
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c17
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c33
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h30
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c43
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c264
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h40
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c376
-rw-r--r--arch/x86/kvm/Kconfig12
16 files changed, 758 insertions, 188 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4f30ad647ec5..489531ed236e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1933,6 +1933,7 @@ config X86_SGX
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b440c950246d..5e2624a9d890 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -292,6 +292,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h
index dd7602c44c72..a16e2c9154a3 100644
--- a/arch/x86/kernel/cpu/sgx/arch.h
+++ b/arch/x86/include/asm/sgx.h
@@ -2,15 +2,20 @@
/**
* Copyright(c) 2016-20 Intel Corporation.
*
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
*/
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
#include <linux/bits.h>
#include <linux/types.h>
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions. The two should not be mixed
+ * together for better readibility. The architectural definitions come first.
+ */
+
/* The SGX specific CPUID function. */
#define SGX_CPUID 0x12
/* EPC enumeration. */
@@ -22,16 +27,36 @@
/* The bitmask for the EPC section type. */
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
+ * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
* %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
* %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_UNMASKED_EVENT = 128,
};
@@ -335,4 +360,19 @@ struct sgx_sigstruct {
#define SGX_LAUNCH_TOKEN_SIZE 304
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..defda61f372d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
@@ -173,10 +177,29 @@ update_caps:
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 972ec3bfa9c0..21d1f062895a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7449ef33f081..3be203297988 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,7 +7,7 @@
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
/**
* ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
*
@@ -55,6 +40,19 @@ enum sgx_encls_function {
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 90a5caf76939..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
@@ -665,24 +667,11 @@ out:
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8df81a3ed945..ad904747419e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(&section->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(&section->lock);
-
- page = list_first_entry(&section->init_laundry_list,
- struct sgx_epc_page, list);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, &section->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(&section->lock);
+ }
cond_resched();
}
- list_splice(&dirty, &section->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
+ struct sgx_numa_node *node;
pgoff_t page_index;
int cnt = 0;
int ret;
@@ -379,50 +399,33 @@ skip:
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
section = &sgx_epc_sections[epc_page->section];
- spin_lock(&section->lock);
- list_add_tail(&epc_page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
- }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
+ node = section->node;
- return cnt;
+ spin_lock(&node->lock);
+ list_add_tail(&epc_page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
+ spin_unlock(&node->lock);
+ }
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
- !list_empty(&sgx_active_page_list);
+ return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
- struct sgx_epc_page *page;
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(&section->lock);
+ spin_lock(&node->lock);
- if (list_empty(&section->page_list)) {
- spin_unlock(&section->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ sgx_nr_free_pages--;
+
+ spin_unlock(&node->lock);
- spin_unlock(&section->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
+
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
- page = __sgx_alloc_epc_page_from_section(section);
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ list_add_tail(&page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
- spin_lock(&section->lock);
- list_add_tail(&page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ spin_unlock(&node->lock);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(&section->lock);
- INIT_LIST_HEAD(&section->page_list);
- INIT_LIST_HEAD(&section->init_laundry_list);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ node_set(nid, sgx_numa_mask);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
sgx_nr_epc_sections++;
}
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
static int __init sgx_init(void)
{
int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5fa42d143feb..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,11 +8,15 @@
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
@@ -30,28 +34,25 @@ struct sgx_epc_page {
};
/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ spinlock_t lock;
+};
+
+/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..6ad165a5c0cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret;
+
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+
+ return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..f6b93a35ce14 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM