summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorBorislav Petkov <bp@suse.de>2020-12-01 18:01:27 +0100
committerBorislav Petkov <bp@suse.de>2020-12-01 18:01:27 +0100
commit87314fb181f9042a226d721ab4a5579ddfca139c (patch)
treeb663e58cfe19ecf5003d4ea6e8048f6ae1925e9f /arch/x86/kernel
parent2002d2951398317d0f46e64ae6d8dd58ed541c6d (diff)
parentb65054597872ce3aefbc6a666385eabdf9e288da (diff)
Merge tag 'v5.10-rc6' into x86/cache
Merge -rc6 tag to pick up dependent changes. Signed-off-by: Borislav Petkov <bp@suse.de>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c29
-rw-r--r--arch/x86/kernel/cpu/bugs.c55
-rw-r--r--arch/x86/kernel/cpu/mce/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c63
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c65
-rw-r--r--arch/x86/kernel/dumpstack.c23
-rw-r--r--arch/x86/kernel/head_64.S16
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c3
-rw-r--r--arch/x86/kernel/perf_regs.c15
-rw-r--r--arch/x86/kernel/process.c12
-rw-r--r--arch/x86/kernel/sev-es-shared.c26
-rw-r--r--arch/x86/kernel/sev-es.c20
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S89
-rw-r--r--arch/x86/kernel/tboot.c8
-rw-r--r--arch/x86/kernel/traps.c43
-rw-r--r--arch/x86/kernel/unwind_orc.c9
17 files changed, 319 insertions, 172 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4adbe65afe23..2400ad62f330 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -807,6 +807,15 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
+
+ /*
+ * Make sure not to be in TLB lazy mode, as otherwise we'll end up
+ * with a stale address space WITHOUT being in lazy mode after
+ * restoring the previous mm.
+ */
+ if (this_cpu_read(cpu_tlbstate.is_lazy))
+ leave_mm(smp_processor_id());
+
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 714233cee0b5..1b98f8c12b96 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -33,7 +33,7 @@ static union uvh_apicid uvh_apicid;
static int uv_node_id;
/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */
-static u8 uv_archtype[UV_AT_SIZE];
+static u8 uv_archtype[UV_AT_SIZE + 1];
static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
@@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from)
{
/* Relies on 'to' being NULL chars so result will be NULL terminated */
strncpy(to, from, len-1);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
}
/* Find UV arch type entry in UVsystab */
@@ -317,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr)
if (n > 0 && n < sizeof(uv_ate->archtype)) {
pr_info("UV: UVarchtype received from BIOS\n");
- uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
return 1;
}
return 0;
@@ -366,7 +369,7 @@ static int __init early_get_arch_type(void)
return ret;
}
-static int __init uv_set_system_type(char *_oem_id)
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
{
/* Save OEM_ID passed from ACPI MADT */
uv_stringify(sizeof(oem_id), oem_id, _oem_id);
@@ -375,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id)
if (!early_get_arch_type())
/* If not use OEM ID for UVarchtype */
- uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
/* Check if not hubbed */
if (strncmp(uv_archtype, "SGI", 3) != 0) {
@@ -386,13 +389,23 @@ static int __init uv_set_system_type(char *_oem_id)
/* (Not hubless), not a UV */
return 0;
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
/* UV4 Hubless: CH */
- if (strncmp(uv_archtype, "NSGI4", 5) == 0)
- uv_hubless_system = 0x11;
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
/* UV3 Hubless: UV300/MC990X w/o hub */
else
- uv_hubless_system = 0x9;
+ uv_hubless_system |= 0x8;
+
+ /* Copy APIC type */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
oem_id, oem_table_id, uv_system_type, uv_hubless_system);
@@ -456,7 +469,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
/* If not UV, return. */
- if (likely(uv_set_system_type(_oem_id) == 0))
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
return 0;
/* Save and Decode OEM Table ID */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d3f0db463f96..d41b70fe4918 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -739,11 +739,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ spectre_v2_user_ibpb = mode;
switch (cmd) {
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
static_branch_enable(&switch_mm_always_ibpb);
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_AUTO:
@@ -757,8 +759,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
-
- spectre_v2_user_ibpb = mode;
}
/*
@@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
return 0;
}
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
@@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
+
/*
- * Indirect branch speculation is always disabled in strict
- * mode. It can neither be enabled if it was force-disabled
- * by a previous prctl call.
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
*/
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ if (!is_spec_ib_user_controlled() ||
task_spec_ib_force_disable(task))
return -EPERM;
+
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
break;
@@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+
+ if (!is_spec_ib_user_controlled())
return 0;
+
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
@@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
- return PR_SPEC_DISABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
+ else if (is_spec_ib_user_controlled()) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- } else
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
return PR_SPEC_NOT_AFFECTED;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..32b7099e3511 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1384,8 +1384,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
* When there's any problem use only local no_way_out state.
*/
if (!lmce) {
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ }
} else {
/*
* If there was a fatal machine check we should have
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 6a99535d7f37..7e8e07bddd5f 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -100,53 +100,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
return find_matching_signature(mc, csig, cpf);
}
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- *
- * %true - if there's a match
- * %false - otherwise
- */
-static bool microcode_matches(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- struct extended_sigtable *ext_header;
- unsigned int fam_ucode, model_ucode;
- struct extended_signature *ext_sig;
- unsigned int fam, model;
- int ext_sigcount, i;
-
- fam = x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return false;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- ext_sig++;
- }
- return false;
-}
-
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
@@ -164,7 +117,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size)
return p;
}
-static void save_microcode_patch(void *data, unsigned int size)
+static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
struct ucode_patch *iter, *tmp, *p = NULL;
@@ -210,6 +163,9 @@ static void save_microcode_patch(void *data, unsigned int size)
if (!p)
return;
+ if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
@@ -344,13 +300,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
size -= mc_size;
- if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ if (!find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
if (save) {
- save_microcode_patch(data, mc_size);
+ save_microcode_patch(uci, data, mc_size);
goto next;
}
@@ -483,14 +440,14 @@ static void show_saved_mc(void)
* Save this microcode patch. It will be loaded early when a CPU is
* hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc, unsigned int size)
+static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
{
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
mutex_lock(&x86_cpu_microcode_mutex);
- save_microcode_patch(mc, size);
+ save_microcode_patch(uci, mc, size);
show_saved_mc();
mutex_unlock(&x86_cpu_microcode_mutex);
@@ -935,7 +892,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc, new_mc_size);
+ save_mc_for_early(uci, new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 78dcbb8e0172..05a026d4420b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -507,6 +507,24 @@ unlock:
return ret ?: nbytes;
}
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
+{
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+
struct task_move_callback {
struct callback_head work;
struct rdtgroup *rdtgrp;
@@ -529,7 +547,7 @@ static void move_myself(struct callback_head *head)
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
current->rmid = 0;
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
if (unlikely(current->flags & PF_EXITING))
@@ -1769,7 +1787,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
if (IS_ERR(kn_subdir))
return PTR_ERR(kn_subdir);
- kernfs_get(kn_subdir);
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
return ret;
@@ -1792,7 +1809,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- kernfs_get(kn_info);
ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
if (ret)
@@ -1813,12 +1829,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
goto out_destroy;
}
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn_info);
-
ret = rdtgroup_kn_set_ugid(kn_info);
if (ret)
goto out_destroy;
@@ -1847,12 +1857,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
if (dest_kn)
*dest_kn = kn;
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -2079,8 +2083,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
rdtgroup_pseudo_lock_remove(rdtgrp);
kernfs_unbreak_active_protection(kn);
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
} else {
kernfs_unbreak_active_protection(kn);
}
@@ -2139,13 +2142,11 @@ static int rdt_get_tree(struct fs_context *fc)
&kn_mongrp);
if (ret < 0)
goto out_info;
- kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
if (ret < 0)
goto out_mongrp;
- kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
@@ -2357,7 +2358,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
if (atomic_read(&sentry->waitcount) != 0)
sentry->flags = RDT_DELETED;
else
- kfree(sentry);
+ rdtgroup_remove(sentry);
}
}
@@ -2399,7 +2400,7 @@ static void rmdir_all_sub(void)
if (atomic_read(&rdtgrp->waitcount) != 0)
rdtgrp->flags = RDT_DELETED;
else
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2499,11 +2500,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
if (IS_ERR(kn))
return PTR_ERR(kn);
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that kn is always accessible.
- */
- kernfs_get(kn);
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -2838,8 +2834,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
/*
* kernfs_remove() will drop the reference count on "kn" which
* will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn} call below. Take one extra reference
- * here, which will be dropped inside rdtgroup_kn_unlock().
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
*/
kernfs_get(kn);
@@ -2880,6 +2876,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
out_idfree:
free_rmid(rdtgrp->mon.rmid);
out_destroy:
+ kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
out_free_rgrp:
kfree(rdtgrp);
@@ -2892,7 +2889,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
free_rmid(rgrp->mon.rmid);
- kfree(rgrp);
+ rdtgroup_remove(rgrp);
}
/*
@@ -3049,11 +3046,6 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
list_del(&rdtgrp->mon.crdtgrp_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
@@ -3065,11 +3057,6 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
rdtgrp->flags = RDT_DELETED;
list_del(&rdtgrp->rdtgroup_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 25c06b67e7e0..97aa900386cb 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -78,6 +78,9 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (!user_mode(regs))
return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+ /* The user space code from other tasks cannot be accessed. */
+ if (regs != task_pt_regs(current))
+ return -EPERM;
/*
* Make sure userspace isn't trying to trick us into dumping kernel
* memory by pointing the userspace instruction pointer at it.
@@ -85,6 +88,12 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
return -EINVAL;
+ /*
+ * Even if named copy_from_user_nmi() this can be invoked from
+ * other contexts and will not try to resolve a pagefault, which is
+ * the correct thing to do here as this code can be called from any
+ * context.
+ */
return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}
@@ -115,13 +124,19 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
u8 opcodes[OPCODE_BUFSIZE];
unsigned long prologue = regs->ip - PROLOGUE_SIZE;
- if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
- printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
- loglvl, prologue);
- } else {
+ switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
+ case 0:
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
__stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ break;
+ case -EPERM:
+ /* No access to the user space stack of other tasks. Ignore. */
+ break;
+ default:
+ printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
+ loglvl, prologue);
+ break;
}
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7eb2a1c87969..3c417734790f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -161,6 +161,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ * %rsi carries pointer to realmode data and is callee-clobbered. Save
+ * and restore it.
+ */
+ pushq %rsi
+ movq %rax, %rdi
+ call sev_verify_cbit
+ popq %rsi
+
+ /* Switch to new page-table */
movq %rax, %cr3
/* Ensure I am executing from virtual addresses */
@@ -279,6 +294,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
+#include "sev_verify_cbit.S"
#ifdef CONFIG_HOTPLUG_CPU
/*
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 57c2ecf43134..ce831f9448e7 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -200,8 +200,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
/* Copying screen_info will do? */
- memcpy(&params->screen_info, &boot_params.screen_info,
- sizeof(struct screen_info));
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
/* Fill in memsize later */
params->screen_info.ext_mem_k = 0;
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index bb7e1132290b..f9e5352b3bef 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
regs_user->regs = task_pt_regs(current);
regs_user->abi = perf_reg_abi(current);
@@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task)
return PERF_SAMPLE_REGS_ABI_64;
}
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
+ struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
struct pt_regs *user_regs = task_pt_regs(current);
+ if (!in_nmi()) {
+ regs_user->regs = user_regs;
+ regs_user->abi = perf_reg_abi(current);
+ return;
+ }
+
/*
* If we're in an NMI that interrupted task_pt_regs setup, then
* we can't sample user regs at all. This check isn't really
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba4593a913fa..145a7ac0c19a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -685,7 +685,7 @@ void arch_cpu_idle(void)
*/
void __cpuidle default_idle(void)
{
- safe_halt();
+ raw_safe_halt();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
@@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy)
/*
* AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
* states (local apic timer and TSC stop).
+ *
+ * XXX this function is completely buggered vs RCU and tracing.
*/
static void amd_e400_idle(void)
{
@@ -757,9 +759,9 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be called with
* interrupts disabled.
*/
- local_irq_disable();
+ raw_local_irq_disable();
tick_broadcast_exit();
- local_irq_enable();
+ raw_local_irq_enable();
}
/*
@@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void)
if (!need_resched())
__sti_mwait(0, 0);
else
- local_irq_enable();
+ raw_local_irq_enable();
} else {
- local_irq_enable();
+ raw_local_irq_enable();
}
__current_clr_polling();
}
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index 5f83ccaab877..7d04b356d44d 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
goto fail;
regs->dx = val >> 32;
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Hypervisor CPUID bit
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if ((fn == 1 && !(regs->cx & BIT(31))))
+ /* Hypervisor bit */
+ goto fail;
+ else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
/* Skip over the CPUID two-byte opcode */
regs->ip += 2;
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 4a96726fbaf8..0bd1a0fc587e 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -374,8 +374,8 @@ fault:
return ES_EXCEPTION;
}
-static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
{
unsigned long va = (unsigned long)vaddr;
unsigned int level;
@@ -394,15 +394,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
if (user_mode(ctxt->regs))
ctxt->fi.error_code |= X86_PF_USER;
- return false;
+ return ES_EXCEPTION;
}
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
pa |= va & ~page_level_mask(level);
*paddr = pa;
- return true;
+ return ES_OK;
}
/* Include code shared with pre-decompression boot stage */
@@ -731,6 +735,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
{
u64 exit_code, exit_info_1, exit_info_2;
unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
phys_addr_t paddr;
void __user *ref;
@@ -740,11 +745,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
- if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) {
- if (!read)
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
ctxt->fi.error_code |= X86_PF_WRITE;
- return ES_EXCEPTION;
+ return res;
}
exit_info_1 = paddr;
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
new file mode 100644
index 000000000000..ee04941a6546
--- /dev/null
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sev_verify_cbit.S - Code for verification of the C-bit position reported
+ * by the Hypervisor when running with SEV enabled.
+ *
+ * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de)
+ *
+ * sev_verify_cbit() is called before switching to a new long-mode page-table
+ * at boot.
+ *
+ * Verify that the C-bit position is correct by writing a random value to
+ * an encrypted memory location while on the current page-table. Then it
+ * switches to the new page-table to verify the memory content is still the
+ * same. After that it switches back to the current page-table and when the
+ * check succeeded it returns. If the check failed the code invalidates the
+ * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to
+ * make sure no interrupt or exception can get the CPU out of the hlt loop.
+ *
+ * New page-table pointer is expected in %rdi (first parameter)
+ *
+ */
+SYM_FUNC_START(sev_verify_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* First check if a C-bit was detected */
+ movq sme_me_mask(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */
+ movq sev_status(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* Save CR4 in %rsi */
+ movq %cr4, %rsi
+
+ /* Disable Global Pages */
+ movq %rsi, %rdx
+ andq $(~X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /*
+ * Verified that running under SEV - now get a random value using
+ * RDRAND. This instruction is mandatory when running as an SEV guest.
+ *
+ * Don't bail out of the loop if RDRAND returns errors. It is better to
+ * prevent forward progress than to work with a non-random value here.
+ */
+1: rdrand %rdx
+ jnc 1b
+
+ /* Store value to memory and keep it in %rdx */
+ movq %rdx, sev_check_data(%rip)
+
+ /* Backup current %cr3 value to restore it later */
+ movq %cr3, %rcx
+
+ /* Switch to new %cr3 - This might unmap the stack */
+ movq %rdi, %cr3
+
+ /*
+ * Compare value in %rdx with memory location. If C-bit is incorrect
+ * this would read the encrypted data and make the check fail.
+ */
+ cmpq %rdx, sev_check_data(%rip)
+
+ /* Restore old %cr3 */
+ movq %rcx, %cr3
+
+ /* Restore previous CR4 */
+ movq %rsi, %cr4
+
+ /* Check CMPQ result */
+ je 3f
+
+ /*
+ * The check failed, prevent any forward progress to prevent ROP
+ * attacks, invalidate the stack and go into a hlt loop.
+ */
+ xorq %rsp, %rsp
+ subq $0x1000, %rsp
+2: hlt
+ jmp 2b
+3:
+#endif
+ /* Return page-table pointer */
+ movq %rdi, %rax
+ ret
+SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 992fb1415c0f..ae64f98ec2ab 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,16 +514,10 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (intel_iommu_tboot_noforce)
- return 1;
-
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || dmar_disabled)
pr_warn("Forcing Intel-IOMMU to enabled\n");
dmar_disabled = 0;
-#ifdef CONFIG_SWIOTLB
- swiotlb = 0;
-#endif
no_iommu = 0;
return 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c70fb34028b..e19df6cde35d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -793,19 +793,6 @@ static __always_inline unsigned long debug_read_clear_dr6(void)
set_debugreg(DR6_RESERVED, 6);
dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
- /*
- * Clear the virtual DR6 value, ptrace routines will set bits here for
- * things we want signals for.
- */
- current->thread.virtual_dr6 = 0;
-
- /*
- * The SDM says "The processor clears the BTF flag when it
- * generates a debug exception." Clear TIF_BLOCKSTEP to keep
- * TIF_BLOCKSTEP in sync with the hardware BTF flag.
- */
- clear_thread_flag(TIF_BLOCKSTEP);
-
return dr6;
}
@@ -873,6 +860,20 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
*/
WARN_ON_ONCE(user_mode(regs));
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." but PTRACE_BLOCKSTEP requested
+ * it for userspace, but we just took a kernel #DB, so re-set
+ * BTF.
+ */
+ unsigned long debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= DEBUGCTLMSR_BTF;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
/*
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
* watchpoint at the same time then that will still be handled.
@@ -936,6 +937,22 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
instrumentation_begin();
/*
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
+ *
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
+ * even if it is not the result of PTRACE_SINGLESTEP.
+ */
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
+
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_thread_flag(TIF_BLOCKSTEP);
+
+ /*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 6a339ce328e0..73f800100066 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -321,19 +321,12 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
- struct task_struct *task = state->task;
-
if (unwind_done(state))
return NULL;
if (state->regs)
return &state->regs->ip;
- if (task != current && state->sp == task->thread.sp) {
- struct inactive_task_frame *frame = (void *)task->thread.sp;
- return &frame->ret_addr;
- }
-
if (state->sp)
return (unsigned long *)state->sp - 1;
@@ -663,7 +656,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
} else {
struct inactive_task_frame *frame = (void *)task->thread.sp;
- state->sp = task->thread.sp;
+ state->sp = task->thread.sp + sizeof(*frame);
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
state->signal = (void *)state->ip == ret_from_fork;