summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/apic/apic.c115
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c4
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/htirq.c21
-rw-r--r--arch/x86/kernel/apic/io_apic.c44
-rw-r--r--arch/x86/kernel/apic/msi.c55
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/vector.c53
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c36
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c26
-rw-r--r--arch/x86/kernel/cpu/cyrix.c1
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c13
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c18
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c24
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c18
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/ftrace.c20
-rw-r--r--arch/x86/kernel/head64.c145
-rw-r--r--arch/x86/kernel/head_64.S131
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/irq.c78
-rw-r--r--arch/x86/kernel/kprobes/core.c9
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/ldt.c56
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/nmi_selftest.c2
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c4
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/tsc.c210
-rw-r--r--arch/x86/kernel/tsc_sync.c21
-rw-r--r--arch/x86/kernel/unwind_frame.c49
50 files changed, 679 insertions, 571 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c5b8f760473c..32e14d137416 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
- /* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ /*
+ * 0xe8 is a relative jump; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
*(s32 *)(insnbuf + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2d75faf743f2..98b3dd8cf2bf 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -54,6 +54,8 @@
#include <asm/mce.h>
#include <asm/tsc.h>
#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
unsigned int num_processors;
@@ -545,6 +547,81 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+#define DEADLINE_MODEL_MATCH_FUNC(model, func) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func }
+
+#define DEADLINE_MODEL_MATCH_REV(model, rev) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
+
+static u32 hsx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x02: return 0x3a; /* EP */
+ case 0x04: return 0x0f; /* EX */
+ }
+
+ return ~0U;
+}
+
+static u32 bdx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x02: return 0x00000011;
+ case 0x03: return 0x0700000e;
+ case 0x04: return 0x0f00000c;
+ case 0x05: return 0x0e000003;
+ }
+
+ return ~0U;
+}
+
+static const struct x86_cpu_id deadline_match[] = {
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+
+ {},
+};
+
+static void apic_check_deadline_errata(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(deadline_match);
+ u32 rev;
+
+ if (!m)
+ return;
+
+ /*
+ * Function pointers will have the MSB set due to address layout,
+ * immediate revisions will not.
+ */
+ if ((long)m->driver_data < 0)
+ rev = ((u32 (*)(void))(m->driver_data))();
+ else
+ rev = (u32)m->driver_data;
+
+ if (boot_cpu_data.microcode >= rev)
+ return;
+
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+ "please update microcode to version: 0x%x (or later)\n", rev);
+}
+
/*
* Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
@@ -563,6 +640,7 @@ static void setup_APIC_timer(void)
levt->cpumask = cpumask_of(smp_processor_id());
if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->name = "lapic-deadline";
levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_DUMMY);
levt->set_next_event = lapic_next_deadline;
@@ -1779,6 +1857,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
+ apic_check_deadline_errata();
+
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
return;
@@ -2201,23 +2281,32 @@ void default_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+int default_cpu_mask_to_apicid(const struct cpumask *mask,
+ struct irq_data *irqdata,
+ unsigned int *apicid)
{
- unsigned int cpu;
+ unsigned int cpu = cpumask_first(mask);
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+ *apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ irq_data_update_effective_affinity(irqdata, cpumask_of(cpu));
+ return 0;
+}
- if (likely(cpu < nr_cpu_ids)) {
- *apicid = per_cpu(x86_cpu_to_apicid, cpu);
- return 0;
- }
+int flat_cpu_mask_to_apicid(const struct cpumask *mask,
+ struct irq_data *irqdata,
+ unsigned int *apicid)
- return -EINVAL;
+{
+ struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
+ unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS;
+
+ if (!cpu_mask)
+ return -EINVAL;
+ *apicid = (unsigned int)cpu_mask;
+ cpumask_bits(effmsk)[0] = cpu_mask;
+ return 0;
}
/*
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index a4d7ff20ed22..dedd5a41ba48 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -172,7 +172,7 @@ static struct apic apic_flat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
@@ -268,7 +268,7 @@ static struct apic apic_physflat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 2262eb6df796..6599f437b4ab 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -141,7 +141,7 @@ struct apic apic_noop __ro_after_init = {
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index e08fe2c8dd8c..2fda912219a6 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -267,7 +267,7 @@ static const struct apic apic_numachip1 __refconst = {
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
@@ -318,7 +318,7 @@ static const struct apic apic_numachip2 __refconst = {
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 56012010332c..456e45e8bf84 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -172,7 +172,7 @@ static struct apic apic_bigsmp __ro_after_init = {
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index ae50d3454d78..56ccf9346b08 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -150,16 +150,27 @@ static const struct irq_domain_ops htirq_domain_ops = {
.deactivate = htirq_domain_deactivate,
};
-void arch_init_htirq_domain(struct irq_domain *parent)
+void __init arch_init_htirq_domain(struct irq_domain *parent)
{
+ struct fwnode_handle *fn;
+
if (disable_apic)
return;
- htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+ fn = irq_domain_alloc_named_fwnode("PCI-HT");
+ if (!fn)
+ goto warn;
+
+ htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL);
+ irq_domain_free_fwnode(fn);
if (!htirq_domain)
- pr_warn("failed to initialize irqdomain for HTIRQ.\n");
- else
- htirq_domain->parent = parent;
+ goto warn;
+
+ htirq_domain->parent = parent;
+ return;
+
+warn:
+ pr_warn("Failed to initialize irqdomain for HTIRQ.\n");
}
int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..b4f5f73febdb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
@@ -2223,6 +2201,8 @@ static int mp_irqdomain_create(int ioapic)
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct fwnode_handle *fn;
+ char *name = "IO-APIC";
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
@@ -2233,9 +2213,25 @@ static int mp_irqdomain_create(int ioapic)
parent = irq_remapping_get_ir_irq_domain(&info);
if (!parent)
parent = x86_vector_domain;
+ else
+ name = "IO-APIC-IR";
+
+ /* Handle device tree enumerated APICs proper */
+ if (cfg->dev) {
+ fn = of_node_to_fwnode(cfg->dev);
+ } else {
+ fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ if (!fn)
+ return -ENOMEM;
+ }
+
+ ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
+ (void *)(long)ioapic);
+
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
- ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
- (void *)(long)ioapic);
if (!ip->irqdomain)
return -ENOMEM;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c61aec7e65f4..9b18be764422 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -136,13 +136,20 @@ static struct msi_domain_info pci_msi_domain_info = {
.handler_name = "edge",
};
-void arch_init_msi_domain(struct irq_domain *parent)
+void __init arch_init_msi_domain(struct irq_domain *parent)
{
+ struct fwnode_handle *fn;
+
if (disable_apic)
return;
- msi_default_domain = pci_msi_create_irq_domain(NULL,
- &pci_msi_domain_info, parent);
+ fn = irq_domain_alloc_named_fwnode("PCI-MSI");
+ if (fn) {
+ msi_default_domain =
+ pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
+ parent);
+ irq_domain_free_fwnode(fn);
+ }
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
}
@@ -167,9 +174,18 @@ static struct msi_domain_info pci_msi_ir_domain_info = {
.handler_name = "edge",
};
-struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
+ const char *name, int id)
{
- return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+ struct fwnode_handle *fn;
+ struct irq_domain *d;
+
+ fn = irq_domain_alloc_named_id_fwnode(name, id);
+ if (!fn)
+ return NULL;
+ d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
+ irq_domain_free_fwnode(fn);
+ return d;
}
#endif
@@ -221,13 +237,20 @@ static struct irq_domain *dmar_get_irq_domain(void)
{
static struct irq_domain *dmar_domain;
static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
mutex_lock(&dmar_lock);
- if (dmar_domain == NULL)
- dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
+ irq_domain_free_fwnode(fn);
+ }
+out:
mutex_unlock(&dmar_lock);
-
return dmar_domain;
}
@@ -317,9 +340,10 @@ static struct msi_domain_info hpet_msi_domain_info = {
struct irq_domain *hpet_create_irq_domain(int hpet_id)
{
- struct irq_domain *parent;
- struct irq_alloc_info info;
struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct irq_alloc_info info;
+ struct fwnode_handle *fn;
if (x86_vector_domain == NULL)
return NULL;
@@ -340,7 +364,16 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
else
hpet_msi_controller.name = "IR-HPET-MSI";
- return msi_create_irq_domain(NULL, domain_info, parent);
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
+ }
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ irq_domain_free_fwnode(fn);
+ return d;
}
int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 2e8f7f048f4f..63287659adb6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -102,7 +102,7 @@ static struct apic apic_default __ro_after_init = {
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f3557a1eb562..b3af457ed667 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -103,7 +103,8 @@ static void free_apic_chip_data(struct apic_chip_data *data)
}
static int __assign_irq_vector(int irq, struct apic_chip_data *d,
- const struct cpumask *mask)
+ const struct cpumask *mask,
+ struct irq_data *irqdata)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -141,7 +142,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
/*
* Clear the offline cpus from @vector_cpumask for searching
* and verify whether the result overlaps with @mask. If true,
- * then the call to apic->cpu_mask_to_apicid_and() will
+ * then the call to apic->cpu_mask_to_apicid() will
* succeed as well. If not, no point in trying to find a
* vector in this mask.
*/
@@ -221,34 +222,40 @@ success:
* Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
* as we already established, that mask & d->domain & cpu_online_mask
* is not empty.
+ *
+ * vector_searchmask is a subset of d->domain and has the offline
+ * cpus masked out.
*/
- BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid));
+ cpumask_and(vector_searchmask, vector_searchmask, mask);
+ BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata,
+ &d->cfg.dest_apicid));
return 0;
}
static int assign_irq_vector(int irq, struct apic_chip_data *data,
- const struct cpumask *mask)
+ const struct cpumask *mask,
+ struct irq_data *irqdata)
{
int err;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, data, mask);
+ err = __assign_irq_vector(irq, data, mask, irqdata);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
static int assign_irq_vector_policy(int irq, int node,
struct apic_chip_data *data,
- struct irq_alloc_info *info)
+ struct irq_alloc_info *info,
+ struct irq_data *irqdata)
{
if (info && info->mask)
- return assign_irq_vector(irq, data, info->mask);
+ return assign_irq_vector(irq, data, info->mask, irqdata);
if (node != NUMA_NO_NODE &&
- assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0)
return 0;
- return assign_irq_vector(irq, data, apic->target_cpus());
+ return assign_irq_vector(irq, data, apic->target_cpus(), irqdata);
}
static void clear_irq_vector(int irq, struct apic_chip_data *data)
@@ -360,9 +367,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, node, data, info);
+ err = assign_irq_vector_policy(virq + i, node, data, info,
+ irq_data);
if (err)
goto error;
+ /*
+ * If the apic destination mode is physical, then the
+ * effective affinity is restricted to a single target
+ * CPU. Mark the interrupt accordingly.
+ */
+ if (!apic->irq_dest_mode)
+ irqd_set_single_target(irq_data);
}
return 0;
@@ -405,7 +420,7 @@ int __init arch_probe_nr_irqs(void)
}
#ifdef CONFIG_X86_IO_APIC
-static void init_legacy_irqs(void)
+static void __init init_legacy_irqs(void)
{
int i, node = cpu_to_node(0);
struct apic_chip_data *data;
@@ -424,16 +439,21 @@ static void init_legacy_irqs(void)
}
}
#else
-static void init_legacy_irqs(void) { }
+static inline void init_legacy_irqs(void) { }
#endif
int __init arch_early_irq_init(void)
{
+ struct fwnode_handle *fn;
+
init_legacy_irqs();
- x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
- NULL);
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
BUG_ON(x86_vector_domain == NULL);
+ irq_domain_free_fwnode(fn);
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
@@ -529,11 +549,12 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, data, dest);
+ err = assign_irq_vector(irq, data, dest, irq_data);
return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
+ .name = "APIC",
.irq_ack = apic_ack_edge,
.irq_set_affinity = apic_set_affinity,
.irq_retrigger = apic_retrigger_irq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 5a35f208ed95..481237cb1544 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -4,6 +4,7 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/dmar.h>
+#include <linux/irq.h>
#include <linux/cpu.h>
#include <asm/smp.h>
@@ -104,35 +105,30 @@ static void x2apic_send_IPI_all(int vector)
}
static int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
+ unsigned int *apicid)
{
+ struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
+ unsigned int cpu;
u32 dest = 0;
u16 cluster;
- int i;
-
- for_each_cpu_and(i, cpumask, andmask) {
- if (!cpumask_test_cpu(i, cpu_online_mask))
- continue;
- dest = per_cpu(x86_cpu_to_logical_apicid, i);
- cluster = x2apic_cluster(i);
- break;
- }
- if (!dest)
+ cpu = cpumask_first(mask);
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
- for_each_cpu_and(i, cpumask, andmask) {
- if (!cpumask_test_cpu(i, cpu_online_mask))
- continue;
- if (cluster != x2apic_cluster(i))
+ dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ cluster = x2apic_cluster(cpu);
+
+ cpumask_clear(effmsk);
+ for_each_cpu(cpu, mask) {
+ if (cluster != x2apic_cluster(cpu))
continue;
- dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ dest |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+ cpumask_set_cpu(cpu, effmsk);
}
*apicid = dest;
-
return 0;
}
@@ -256,7 +252,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index ff111f05a314..3baf0c3dc875 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -127,7 +127,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b487b3a01615..0d57bb9079c9 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -526,27 +526,15 @@ static void uv_init_apic_ldr(void)
}
static int
-uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
+ unsigned int *apicid)
{
- int unsigned cpu;
+ int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid);
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
-
- if (likely(cpu < nr_cpu_ids)) {
- *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
- return 0;
- }
+ if (!ret)
+ *apicid |= uv_apicid_hibits;
- return -EINVAL;
+ return ret;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -614,7 +602,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
.send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index a70fd61095f8..6f077445647a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5abd4bf73d6e..5cfbaeb6529a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m)
return 1;
}
-static bool memory_error(struct mce *m)
+bool mce_is_memory_error(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ } else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -529,6 +527,7 @@ static bool memory_error(struct mce *m)
return false;
}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool cec_add_mce(struct mce *m)
{
@@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m)
return false;
/* We eat only correctable DRAM errors with usable addresses. */
- if (memory_error(m) &&
+ if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
@@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 45db4d2ebd01..21b185793c80 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -251,7 +251,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
#endif
}
-void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
- desc.data, desc.size);
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret != UCODE_OK)
return -EINVAL;
@@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (save) {
+ struct ucode_patch *p = find_patch(0);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ if (!refresh_fw || !bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index e53d3c909840..9cb98ee103db 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -290,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
return (struct cpio_data){ NULL, 0, "" };
if (initrd_start)
start = initrd_start;
+ } else {
+ /*
+ * The picture with physical addresses is a bit different: we
+ * need to get the *physical* address to which the ramdisk was
+ * relocated, i.e., relocated_ramdisk (not initrd_start) and
+ * since we're running from physical addresses, we need to access
+ * relocated_ramdisk through its *physical* address too.
+ */
+ u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+ if (*rr)
+ start = *rr;
}
return find_cpio_data(path, (void *)start, size, NULL);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index afdfd237b59f..59edbe9d4ccb 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -42,7 +42,7 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
-struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *intel_ucode_patch;
static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
@@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
static void save_microcode_patch(void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p;
+ struct ucode_patch *iter, *tmp, *p = NULL;
bool prev_found = false;
unsigned int sig, pf;
@@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size)
else
list_add_tail(&p->plist, &microcode_cache);
}
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (p) {
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+ }
}
static int microcode_sanity_check(void *mc, int print_err)
@@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void)
struct ucode_cpu_info uci;
struct cpio_data cp;
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path, false);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 04cb8d34ccb8..70e717fccdd6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,6 +161,15 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
}
#endif
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
@@ -193,8 +202,15 @@ static void __init ms_hyperv_init_platform(void)
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
- if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
*/
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
- pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0651e974dcb3..9bef1bbeba63 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
-static inline void tramp_free(void *tramp)
+static inline void tramp_free(void *tramp, int size)
{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ set_memory_nx((unsigned long)tramp, npages);
+ set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
-static inline void tramp_free(void *tramp) { }
+static inline void tramp_free(void *tramp, int size) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Are we pointing to the reference? */
if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret;
+ int ret, npages;
if (ops->trampoline) {
/*
@@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages);
/* The update should never fail */
WARN_ON(ret);
@@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- tramp_free((void *)ops->trampoline);
+ tramp_free((void *)ops->trampoline, ops->trampoline_size);
ops->trampoline = 0;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt = 2;
+static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+ return ptr - (void *)_text + (void *)physaddr;
+}
+
+void __head __startup_64(unsigned long physaddr)
+{
+ unsigned long load_delta, *p;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ int i;
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_PAGE_MASK)
+ for (;;);
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d[511] += load_delta;
+ }
+
+ pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud[510] += load_delta;
+ pud[511] += load_delta;
+
+ pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+ pmd[506] += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+
+ i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+ p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ }
+
+ i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+ pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
+ pmd[idx] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ */
+
+ pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+ }
+
+ /* Fixup phys_base */
+ p = fixup_pointer(&phys_base, physaddr);
+ *p += load_delta;
+}
+
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
- memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;
again:
- pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;
/*
@@ -67,8 +171,25 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (pgd)
- pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
- clear_page(init_level4_pgt);
+ clear_page(init_top_pgt);
kasan_early_init();
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- /* set init_level4_pgt kernel high mapping*/
- init_level4_pgt[511] = early_level4_pgt[511];
+ /* set init_top_pgt kernel high mapping*/
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
*
*/
+#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
- /*
- * Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- testl $~PMD_PAGE_MASK, %ebp
- jnz bad_address
-
- /*
- * Is the address too large?
- */
- leaq _text(%rip), %rax
- shrq $MAX_PHYSMEM_BITS, %rax
- jnz bad_address
-
- /*
- * Fixup the physical addresses in the page table
- */
- addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
-
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
leaq _text(%rip), %rdi
- leaq early_level4_pgt(%rip), %rbx
-
- movq %rdi, %rax
- shrq $PGDIR_SHIFT, %rax
-
- leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
- movq %rdx, 0(%rbx,%rax,8)
- movq %rdx, 8(%rbx,%rax,8)
-
- addq $PAGE_SIZE, %rdx
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- incl %eax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
-
- addq $PAGE_SIZE * 2, %rbx
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rdi
- addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
- leaq (_end - 1)(%rip), %rcx
- shrq $PMD_SHIFT, %rcx
- subq %rdi, %rcx
- incl %ecx
-
-1:
- andq $(PTRS_PER_PMD - 1), %rdi
- movq %rax, (%rbx,%rdi,8)
- incq %rdi
- addq $PMD_SIZE, %rax
- decl %ecx
- jnz 1b
-
- test %rbp, %rbp
- jz .Lskip_fixup
+ pushq %rsi
+ call __startup_64
+ popq %rsi
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- */
- leaq level2_kernel_pgt(%rip), %rdi
- leaq PAGE_SIZE(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testb $_PAGE_PRESENT, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-.Lskip_fixup:
- movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(init_top_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
+ /* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ orl $X86_CR4_LA57, %ecx
+#endif
movq %rcx, %cr4
- /* Setup early boot stage 4 level pagetables. */
+ /* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -417,9 +332,13 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PAGE(early_top_pgt)
.fill 511,8,0
+#ifdef CONFIG_X86_5LEVEL
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
+
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 89ff7af2de50..16f82a3aaec7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -285,7 +285,7 @@ static void hpet_legacy_clockevent_register(void)
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
+ hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index);
clockevents_config_and_register(&hpet_clockevent, hpet_freq,
HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
global_clock_event = &hpet_clockevent;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f34fe7444836..4aa03c5a14c9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -432,84 +432,12 @@ int check_irq_vectors_for_cpu_disable(void)
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
- unsigned int irq, vector;
- static int warned;
+ unsigned int irr, vector;
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
- int ret;
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- /* interrupt's are disabled at this point */
- raw_spin_lock(&desc->lock);
-
- data = irq_desc_get_irq_data(desc);
- affinity = irq_data_get_affinity_mask(data);
- if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
- cpumask_subset(affinity, cpu_online_mask)) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
-
- /*
- * Complete the irq move. This cpu is going down and for
- * non intr-remapping case, we can't wait till this interrupt
- * arrives at this cpu before completing the irq move.
- */
- irq_force_complete_move(desc);
-
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_online_mask;
- }
-
- chip = irq_data_get_irq_chip(data);
- /*
- * The interrupt descriptor might have been cleaned up
- * already, but it is not yet removed from the radix tree
- */
- if (!chip) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
-
- if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
- chip->irq_mask(data);
-
- if (chip->irq_set_affinity) {
- ret = chip->irq_set_affinity(data, affinity, true);
- if (ret == -ENOSPC)
- pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
- } else {
- if (!(warned++))
- set_affinity = 0;
- }
-
- /*
- * We unmask if the irq was not marked masked by the
- * core code. That respects the lazy irq disable
- * behaviour.
- */
- if (!irqd_can_move_in_process_context(data) &&
- !irqd_irq_masked(data) && chip->irq_unmask)
- chip->irq_unmask(data);
-
- raw_spin_unlock(&desc->lock);
-
- if (break_affinity && set_affinity)
- pr_notice("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- pr_notice("Cannot set affinity for irq %i\n", irq);
- }
+ irq_migrate_all_off_this_cpu();
/*
* We can remove mdelay() and then send spuriuous interrupts to
@@ -528,8 +456,6 @@ void fixup_irqs(void)
* nothing else will touch it.
*/
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irr;
-
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 5b2bbfbb3712..6b877807598b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/kasan.h>
+#include <linux/moduleloader.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn)
}
}
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+ set_memory_nx((unsigned long)page & PAGE_MASK, 1);
+ set_memory_rw((unsigned long)page & PAGE_MASK, 1);
+ module_memfree(page);
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index da5c09789984..43e10d6fdbed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token)
*/
rcu_irq_exit();
native_safe_halt();
- rcu_irq_enter();
local_irq_disable();
+ rcu_irq_enter();
}
}
if (!n.halted)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
#include <asm/syscalls.h>
/* context.lock is held for us, so we don't need any locking. */
-static void flush_ldt(void *current_mm)
+static void flush_ldt(void *__mm)
{
+ struct mm_struct *mm = __mm;
mm_context_t *pc;
- if (current->active_mm != current_mm)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &current->active_mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->size);
+ pc = &mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;
- if (size > LDT_ENTRIES)
+ if (num_entries > LDT_ENTRIES)
return NULL;
new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
- alloc_size = size * LDT_ENTRY_SIZE;
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
/*
* Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
}
- new_ldt->size = size;
+ new_ldt->nr_entries = num_entries;
return new_ldt;
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
- paravirt_alloc_ldt(ldt->entries, ldt->size);
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
- paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock;
}
- new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
retval = -ENOMEM;
goto out_unlock;
}
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
- new_ldt->size * LDT_ENTRY_SIZE);
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int retval;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
mutex_lock(&mm->context.lock);
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- size = mm->context.ldt->size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
- if (size != bytecount) {
+ if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
- if (clear_user(ptr + size, bytecount - size)) {
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
- unsigned int oldsize, newsize;
+ unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
mutex_lock(&mm->context.lock);
- old_ldt = mm->context.ldt;
- oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max(ldt_info.entry_number + 1, oldsize);
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
error = -ENOMEM;
- new_ldt = alloc_ldt_struct(newsize);
+ new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;
if (old_ldt)
- memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ VMCOREINFO_SYMBOL(init_top_pgt);
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 6d9582ec0324..d27f8d84c4ff 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
/* Don't wait longer than a second */
timeout = USEC_PER_SEC;
- while (!cpumask_empty(mask) && timeout--)
+ while (!cpumask_empty(mask) && --timeout)
udelay(1);
/* What happens if we timeout, do we still unregister?? */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
+ .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- smp_processor_id());
+ raw_smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
- dead_task->mm->context.ldt->size);
+ dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0b4d3c686b1e..65622f07e633 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
@@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
- simple_udelay_calibration();
-
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ simple_udelay_calibration();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void)
{
idle_task_exit();
- reset_lazy_tlbstate();
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt ||
- seg >= child->mm->context.ldt->size))
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 207b8f2582c7..213ddf3e937d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -187,7 +187,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index d39c09119db6..e0754cdbad37 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -66,7 +66,7 @@ static struct irqaction irq0 = {
.name = "timer"
};
-void __init setup_default_timer_irq(void)
+static void __init setup_default_timer_irq(void)
{
if (!nr_legacy_irqs())
return;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3995d3a777d4..bf54309b85da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -182,7 +182,7 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-static int fixup_bug(struct pt_regs *regs, int trapnr)
+int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..796d96bb0821 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
-
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
+ cyc2ns_read_begin(&data);
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- barrier();
-
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
- cyc2ns_write_end(cpu, data);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
@@ -1412,11 +1328,11 @@ void __init tsc_init(void)
use_tsc_delay();
+ check_system_tsc_reliable();
+
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
- check_system_tsc_reliable();
-
detect_art();
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 728f75378475..7842371bc9e4 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -71,13 +71,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
* non zero. We don't do that on non boot cpus because physical
* hotplug should have set the ADJUST register to a value > 0 so
* the TSC is in sync with the already running cpus.
- *
- * But we always force positive ADJUST values. Otherwise the TSC
- * deadline timer creates an interrupt storm. We also have to
- * prevent values > 0x7FFFFFFF as those wreckage the timer as well.
*/
- if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
- (bootval > 0x7FFFFFFF)) {
+ if (bootcpu && bootval != 0) {
pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
bootval);
wrmsrl(MSR_IA32_TSC_ADJUST, 0);
@@ -451,20 +446,6 @@ retry:
*/
cur->adjusted += cur_max_warp;
- /*
- * TSC deadline timer stops working or creates an interrupt storm
- * with adjust values < 0 and > x07ffffff.
- *
- * To allow adjust values > 0x7FFFFFFF we need to disable the
- * deadline timer and use the local APIC timer, but that requires
- * more intrusive changes and we do not have any useful information
- * from Intel about the underlying HW wreckage yet.
- */
- if (cur->adjusted < 0)
- cur->adjusted = 0;
- if (cur->adjusted > 0x7FFFFFFF)
- cur->adjusted = 0x7FFFFFFF;
-
pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
cpu, cur_max_warp, cur->adjusted);
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 82c6d7f1fd73..b9389d72b2f7 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state)
return (unsigned long *)task_pt_regs(state->task) - 2;
}
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
#ifdef CONFIG_X86_32
#define GCC_REALIGN_WORDS 3
#else
@@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state)
return last_frame(state) - GCC_REALIGN_WORDS;
}
-static bool is_last_task_frame(struct unwind_state *state)
+static bool is_last_aligned_frame(struct unwind_state *state)
{
unsigned long *last_bp = last_frame(state);
unsigned long *aligned_bp = last_aligned_frame(state);
/*
- * We have to check for the last task frame at two different locations
- * because gcc can occasionally decide to realign the stack pointer and
- * change the offset of the stack frame in the prologue of a function
- * called by head/entry code. Examples:
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
*
* <start_secondary>:
* push %edi
@@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state)
* push %rbp
* mov %rsp,%rbp
*
- * Note that after aligning the stack, it pushes a duplicate copy of
- * the return address before pushing the frame pointer.
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
*/
- return (state->bp == last_bp ||
- (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
}
/*