author | Borislav Petkov <bp@suse.de> | 2020-12-01 18:01:27 +0100
committer | Borislav Petkov <bp@suse.de> | 2020-12-01 18:01:27 +0100
commit | 87314fb181f9042a226d721ab4a5579ddfca139c (patch)
tree | b663e58cfe19ecf5003d4ea6e8048f6ae1925e9f /arch/x86/kernel
parent | 2002d2951398317d0f46e64ae6d8dd58ed541c6d (diff)
parent | b65054597872ce3aefbc6a666385eabdf9e288da (diff)
Merge tag 'v5.10-rc6' into x86/cache
Merge -rc6 tag to pick up dependent changes.
Signed-off-by: Borislav Petkov <bp@suse.de>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/alternative.c | 9
-rw-r--r-- | arch/x86/kernel/apic/x2apic_uv_x.c | 29
-rw-r--r-- | arch/x86/kernel/cpu/bugs.c | 55
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 6
-rw-r--r-- | arch/x86/kernel/cpu/microcode/intel.c | 63
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/rdtgroup.c | 65
-rw-r--r-- | arch/x86/kernel/dumpstack.c | 23
-rw-r--r-- | arch/x86/kernel/head_64.S | 16
-rw-r--r-- | arch/x86/kernel/kexec-bzimage64.c | 3
-rw-r--r-- | arch/x86/kernel/perf_regs.c | 15
-rw-r--r-- | arch/x86/kernel/process.c | 12
-rw-r--r-- | arch/x86/kernel/sev-es-shared.c | 26
-rw-r--r-- | arch/x86/kernel/sev-es.c | 20
-rw-r--r-- | arch/x86/kernel/sev_verify_cbit.S | 89
-rw-r--r-- | arch/x86/kernel/tboot.c | 8
-rw-r--r-- | arch/x86/kernel/traps.c | 43
-rw-r--r-- | arch/x86/kernel/unwind_orc.c | 9
17 files changed, 319 insertions, 172 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4adbe65afe23..2400ad62f330 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -807,6 +807,15 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) temp_mm_state_t temp_state; lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate.is_lazy)) + leave_mm(smp_processor_id()); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 714233cee0b5..1b98f8c12b96 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -33,7 +33,7 @@ static union uvh_apicid uvh_apicid; static int uv_node_id; /* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ -static u8 uv_archtype[UV_AT_SIZE]; +static u8 uv_archtype[UV_AT_SIZE + 1]; static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; @@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from) { /* Relies on 'to' being NULL chars so result will be NULL terminated */ strncpy(to, from, len-1); + + /* Trim trailing spaces */ + (void)strim(to); } /* Find UV arch type entry in UVsystab */ @@ -317,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr) if (n > 0 && n < sizeof(uv_ate->archtype)) { pr_info("UV: UVarchtype received from BIOS\n"); - uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype); + uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype); return 1; } return 0; @@ -366,7 +369,7 @@ static int __init early_get_arch_type(void) return ret; } -static int __init uv_set_system_type(char *_oem_id) +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); @@ -375,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id) if (!early_get_arch_type()) /* If not use OEM ID for UVarchtype */ - uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id); + uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id); /* Check if not hubbed */ if (strncmp(uv_archtype, "SGI", 3) != 0) { @@ -386,13 +389,23 @@ static int __init uv_set_system_type(char *_oem_id) /* (Not hubless), not a UV */ return 0; + /* Is UV hubless system */ + uv_hubless_system = 0x01; + + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + /* UV4 Hubless: CH */ - if (strncmp(uv_archtype, "NSGI4", 5) == 0) - uv_hubless_system = 0x11; + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; /* UV3 Hubless: UV300/MC990X w/o hub */ else - uv_hubless_system = 0x9; + uv_hubless_system |= 0x8; + + /* Copy APIC type */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); @@ -456,7 +469,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* If not UV, return. 
*/ - if (likely(uv_set_system_type(_oem_id) == 0)) + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; /* Save and Decode OEM Table ID */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d3f0db463f96..d41b70fe4918 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -739,11 +739,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + spectre_v2_user_ibpb = mode; switch (cmd) { case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_AUTO: @@ -757,8 +759,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); - - spectre_v2_user_ibpb = mode; } /* @@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { @@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; + /* - * Indirect branch speculation is always disabled in strict - * mode. It can neither be enabled if it was force-disabled - * by a previous prctl call. + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. 
*/ - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + if (!is_spec_ib_user_controlled() || task_spec_ib_force_disable(task)) return -EPERM; + task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; @@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + + if (!is_spec_ib_user_controlled()) return 0; + task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); @@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) - return PR_SPEC_DISABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || - spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { + else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - } else + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else return PR_SPEC_NOT_AFFECTED; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4102b866e7c0..32b7099e3511 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1384,8 +1384,10 @@ noinstr void do_machine_check(struct pt_regs *regs) * When there's any problem use only local no_way_out state. */ if (!lmce) { - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (mce_end(order) < 0) { + if (!no_way_out) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } } else { /* * If there was a fatal machine check we should have diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 6a99535d7f37..7e8e07bddd5f 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -100,53 +100,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev return find_matching_signature(mc, csig, cpf); } -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. 
- * - * %true - if there's a match - * %false - otherwise - */ -static bool microcode_matches(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - struct extended_sigtable *ext_header; - unsigned int fam_ucode, model_ucode; - struct extended_signature *ext_sig; - unsigned int fam, model; - int ext_sigcount, i; - - fam = x86_family(sig); - model = x86_model(sig); - - fam_ucode = x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return true; - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return false; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return true; - - ext_sig++; - } - return false; -} - static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; @@ -164,7 +117,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size) return p; } -static void save_microcode_patch(void *data, unsigned int size) +static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; struct ucode_patch *iter, *tmp, *p = NULL; @@ -210,6 +163,9 @@ static void save_microcode_patch(void *data, unsigned int size) if (!p) return; + if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + return; + /* * Save for early loading. On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before @@ -344,13 +300,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) size -= mc_size; - if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + if (!find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { data += mc_size; continue; } if (save) { - save_microcode_patch(data, mc_size); + save_microcode_patch(uci, data, mc_size); goto next; } @@ -483,14 +440,14 @@ static void show_saved_mc(void) * Save this microcode patch. It will be loaded early when a CPU is * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc, unsigned int size) +static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size) { /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); mutex_lock(&x86_cpu_microcode_mutex); - save_microcode_patch(mc, size); + save_microcode_patch(uci, mc, size); show_saved_mc(); mutex_unlock(&x86_cpu_microcode_mutex); @@ -935,7 +892,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. 
*/ - save_mc_for_early(new_mc, new_mc_size); + save_mc_for_early(uci, new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 78dcbb8e0172..05a026d4420b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -507,6 +507,24 @@ unlock: return ret ?: nbytes; } +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + struct task_move_callback { struct callback_head work; struct rdtgroup *rdtgrp; @@ -529,7 +547,7 @@ static void move_myself(struct callback_head *head) (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; current->rmid = 0; - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } if (unlikely(current->flags & PF_EXITING)) @@ -1769,7 +1787,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, if (IS_ERR(kn_subdir)) return PTR_ERR(kn_subdir); - kernfs_get(kn_subdir); ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) return ret; @@ -1792,7 +1809,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - kernfs_get(kn_info); ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); if (ret) @@ -1813,12 +1829,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) goto out_destroy; } - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. - */ - kernfs_get(kn_info); - ret = rdtgroup_kn_set_ugid(kn_info); if (ret) goto out_destroy; @@ -1847,12 +1857,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, if (dest_kn) *dest_kn = kn; - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. 
- */ - kernfs_get(kn); - ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; @@ -2079,8 +2083,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) rdtgroup_pseudo_lock_remove(rdtgrp); kernfs_unbreak_active_protection(kn); - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } else { kernfs_unbreak_active_protection(kn); } @@ -2139,13 +2142,11 @@ static int rdt_get_tree(struct fs_context *fc) &kn_mongrp); if (ret < 0) goto out_info; - kernfs_get(kn_mongrp); ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) goto out_mongrp; - kernfs_get(kn_mondata); rdtgroup_default.mon.mon_data_kn = kn_mondata; } @@ -2357,7 +2358,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) if (atomic_read(&sentry->waitcount) != 0) sentry->flags = RDT_DELETED; else - kfree(sentry); + rdtgroup_remove(sentry); } } @@ -2399,7 +2400,7 @@ static void rmdir_all_sub(void) if (atomic_read(&rdtgrp->waitcount) != 0) rdtgrp->flags = RDT_DELETED; else - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ update_closid_rmid(cpu_online_mask, &rdtgroup_default); @@ -2499,11 +2500,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (IS_ERR(kn)) return PTR_ERR(kn); - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that kn is always accessible. - */ - kernfs_get(kn); ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; @@ -2838,8 +2834,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, /* * kernfs_remove() will drop the reference count on "kn" which * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn} call below. Take one extra reference - * here, which will be dropped inside rdtgroup_kn_unlock(). + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). */ kernfs_get(kn); @@ -2880,6 +2876,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, out_idfree: free_rmid(rdtgrp->mon.rmid); out_destroy: + kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); out_free_rgrp: kfree(rdtgrp); @@ -2892,7 +2889,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); free_rmid(rgrp->mon.rmid); - kfree(rgrp); + rdtgroup_remove(rgrp); } /* @@ -3049,11 +3046,6 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); list_del(&rdtgrp->mon.crdtgrp_list); - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn); return 0; @@ -3065,11 +3057,6 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn); return 0; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 25c06b67e7e0..97aa900386cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -78,6 +78,9 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, if (!user_mode(regs)) return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + /* The user space code from other tasks cannot be accessed. 
*/ + if (regs != task_pt_regs(current)) + return -EPERM; /* * Make sure userspace isn't trying to trick us into dumping kernel * memory by pointing the userspace instruction pointer at it. @@ -85,6 +88,12 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) return -EINVAL; + /* + * Even if named copy_from_user_nmi() this can be invoked from + * other contexts and will not try to resolve a pagefault, which is + * the correct thing to do here as this code can be called from any + * context. + */ return copy_from_user_nmi(buf, (void __user *)src, nbytes); } @@ -115,13 +124,19 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) u8 opcodes[OPCODE_BUFSIZE]; unsigned long prologue = regs->ip - PROLOGUE_SIZE; - if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { - printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", - loglvl, prologue); - } else { + switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { + case 0: printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); + break; + case -EPERM: + /* No access to the user space stack of other tasks. Ignore. */ + break; + default: + printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + loglvl, prologue); + break; } } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7eb2a1c87969..3c417734790f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -161,6 +161,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup early boot stage 4-/5-level pagetables. */ addq phys_base(%rip), %rax + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + * %rsi carries pointer to realmode data and is callee-clobbered. Save + * and restore it. + */ + pushq %rsi + movq %rax, %rdi + call sev_verify_cbit + popq %rsi + + /* Switch to new page-table */ movq %rax, %cr3 /* Ensure I am executing from virtual addresses */ @@ -279,6 +294,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" +#include "sev_verify_cbit.S" #ifdef CONFIG_HOTPLUG_CPU /* diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 57c2ecf43134..ce831f9448e7 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -200,8 +200,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; /* Copying screen_info will do? 
*/ - memcpy(¶ms->screen_info, &boot_params.screen_info, - sizeof(struct screen_info)); + memcpy(¶ms->screen_info, &screen_info, sizeof(struct screen_info)); /* Fill in memsize later */ params->screen_info.ext_mem_k = 0; diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b..f9e5352b3bef 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); @@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task) return PERF_SAMPLE_REGS_ABI_64; } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba4593a913fa..145a7ac0c19a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -685,7 +685,7 @@ void arch_cpu_idle(void) */ void __cpuidle default_idle(void) { - safe_halt(); + raw_safe_halt(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); @@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy) /* * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing. */ static void amd_e400_idle(void) { @@ -757,9 +759,9 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be called with * interrupts disabled. */ - local_irq_disable(); + raw_local_irq_disable(); tick_broadcast_exit(); - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void) if (!need_resched()) __sti_mwait(0, 0); else - local_irq_enable(); + raw_local_irq_enable(); } else { - local_irq_enable(); + raw_local_irq_enable(); } __current_clr_polling(); } diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 5f83ccaab877..7d04b356d44d 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) goto fail; regs->dx = val >> 32; + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Hypervisor CPUID bit + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. 
+ */ + + if ((fn == 1 && !(regs->cx & BIT(31)))) + /* Hypervisor bit */ + goto fail; + else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + /* Skip over the CPUID two-byte opcode */ regs->ip += 2; diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 4a96726fbaf8..0bd1a0fc587e 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -374,8 +374,8 @@ fault: return ES_EXCEPTION; } -static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) { unsigned long va = (unsigned long)vaddr; unsigned int level; @@ -394,15 +394,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, if (user_mode(ctxt->regs)) ctxt->fi.error_code |= X86_PF_USER; - return false; + return ES_EXCEPTION; } + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; pa |= va & ~page_level_mask(level); *paddr = pa; - return true; + return ES_OK; } /* Include code shared with pre-decompression boot stage */ @@ -731,6 +735,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, { u64 exit_code, exit_info_1, exit_info_2; unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; phys_addr_t paddr; void __user *ref; @@ -740,11 +745,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { - if (!read) + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) ctxt->fi.error_code |= X86_PF_WRITE; - return ES_EXCEPTION; + return res; } exit_info_1 = paddr; diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000000..ee04941a6546 --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sev_verify_cbit.S - Code for verification of the C-bit position reported + * by the Hypervisor when running with SEV enabled. + * + * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. 
+ * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* First check if a C-bit was detected */ + movq sme_me_mask(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ + movq sev_status(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* Save CR4 in %rsi */ + movq %cr4, %rsi + + /* Disable Global Pages */ + movq %rsi, %rdx + andq $(~X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* + * Verified that running under SEV - now get a random value using + * RDRAND. This instruction is mandatory when running as an SEV guest. + * + * Don't bail out of the loop if RDRAND returns errors. It is better to + * prevent forward progress than to work with a non-random value here. + */ +1: rdrand %rdx + jnc 1b + + /* Store value to memory and keep it in %rdx */ + movq %rdx, sev_check_data(%rip) + + /* Backup current %cr3 value to restore it later */ + movq %cr3, %rcx + + /* Switch to new %cr3 - This might unmap the stack */ + movq %rdi, %cr3 + + /* + * Compare value in %rdx with memory location. If C-bit is incorrect + * this would read the encrypted data and make the check fail. + */ + cmpq %rdx, sev_check_data(%rip) + + /* Restore old %cr3 */ + movq %rcx, %cr3 + + /* Restore previous CR4 */ + movq %rsi, %cr4 + + /* Check CMPQ result */ + je 3f + + /* + * The check failed, prevent any forward progress to prevent ROP + * attacks, invalidate the stack and go into a hlt loop. + */ + xorq %rsp, %rsp + subq $0x1000, %rsp +2: hlt + jmp 2b +3: +#endif + /* Return page-table pointer */ + movq %rdi, %rax + ret +SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 992fb1415c0f..ae64f98ec2ab 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,16 +514,10 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; - if (intel_iommu_tboot_noforce) - return 1; - - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || dmar_disabled) pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; -#ifdef CONFIG_SWIOTLB - swiotlb = 0; -#endif no_iommu = 0; return 1; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3c70fb34028b..e19df6cde35d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -793,19 +793,6 @@ static __always_inline unsigned long debug_read_clear_dr6(void) set_debugreg(DR6_RESERVED, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ - /* - * Clear the virtual DR6 value, ptrace routines will set bits here for - * things we want signals for. - */ - current->thread.virtual_dr6 = 0; - - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_thread_flag(TIF_BLOCKSTEP); - return dr6; } @@ -873,6 +860,20 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, */ WARN_ON_ONCE(user_mode(regs)); + if (test_thread_flag(TIF_BLOCKSTEP)) { + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." but PTRACE_BLOCKSTEP requested + * it for userspace, but we just took a kernel #DB, so re-set + * BTF. + */ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= DEBUGCTLMSR_BTF; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. 
@@ -936,6 +937,22 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, instrumentation_begin(); /* + * Start the virtual/ptrace DR6 value with just the DR_STEP mask + * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. + * + * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) + * even if it is not the result of PTRACE_SINGLESTEP. + */ + current->thread.virtual_dr6 = (dr6 & DR_STEP); + + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); + + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 6a339ce328e0..73f800100066 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -321,19 +321,12 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) { - struct task_struct *task = state->task; - if (unwind_done(state)) return NULL; if (state->regs) return &state->regs->ip; - if (task != current && state->sp == task->thread.sp) { - struct inactive_task_frame *frame = (void *)task->thread.sp; - return &frame->ret_addr; - } - if (state->sp) return (unsigned long *)state->sp - 1; @@ -663,7 +656,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } else { struct inactive_task_frame *frame = (void *)task->thread.sp; - state->sp = task->thread.sp; + state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); state->signal = (void *)state->ip == ret_from_fork; |
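The cpu/bugs.c hunk above reworks ib_prctl_set()/ib_prctl_get() around the new is_spec_ib_user_controlled() helper, so the per-task indirect-branch speculation control is honoured whenever either IBPB or STIBP is in a conditional (prctl/seccomp) mode. As a rough illustration only — not part of this merge — here is a minimal userspace sketch of the prctl interface those paths implement; the constants come from <linux/prctl.h>, and the exact state reported depends on the boot-time spectre_v2_user= selection.

```c
/*
 * Userspace sketch (illustrative, not from the diff): query and set the
 * per-task indirect-branch speculation control handled by ib_prctl_get()
 * and ib_prctl_set() in arch/x86/kernel/cpu/bugs.c.
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* PR_SPEC_PRCTL in the returned state means per-task control is available. */
	long state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
	if (state < 0) {
		perror("PR_GET_SPECULATION_CTRL");
		return 1;
	}
	printf("indirect branch speculation state: 0x%lx\n", state);

	/*
	 * Force-disable indirect branch speculation for this task; see
	 * ib_prctl_set() in the bugs.c hunk for how the conditional and
	 * strict modes are handled after this merge.
	 */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_FORCE_DISABLE, 0, 0) < 0)
		perror("PR_SET_SPECULATION_CTRL");

	return 0;
}
```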