summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c4
-rw-r--r--arch/x86/kernel/amd_iommu_init.c67
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c13
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c10
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c18
-rw-r--r--arch/x86/kernel/cpu/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c13
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c71
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c96
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c15
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/crash.c3
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/hpet.c33
-rw-r--r--arch/x86/kernel/hw_breakpoint.c40
-rw-r--r--arch/x86/kernel/i387.c1
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kprobes.c25
-rw-r--r--arch/x86/kernel/module.c3
-rw-r--r--arch/x86/kernel/mpparse.c16
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/setup.c14
-rw-r--r--arch/x86/kernel/smpboot.c53
-rw-r--r--arch/x86/kernel/sys_i386_32.c4
-rw-r--r--arch/x86/kernel/tlb_uv.c760
-rw-r--r--arch/x86/kernel/trampoline.c17
-rw-r--r--arch/x86/kernel/tsc.c38
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
41 files changed, 940 insertions, 1654 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dd9a2e459c78..96abc8fa56b1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
@@ -91,7 +93,6 @@ obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..fb16f17e59be 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -61,7 +61,7 @@ struct cstate_entry {
unsigned int ecx;
} states[ACPI_PROCESSOR_MAX_POWER];
};
-static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fa044e1e30a2..679b6450382b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
size_t size,
int dir)
{
+ dma_addr_t flush_addr;
dma_addr_t i, start;
unsigned int pages;
@@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
(dma_addr + size > dma_dom->aperture_size))
return;
+ flush_addr = dma_addr;
pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
dma_addr &= PAGE_MASK;
start = dma_addr;
@@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, dma_addr, size);
+ iommu_flush_pages(&dma_dom->domain, flush_addr, size);
dma_dom->need_flush = false;
}
}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3cc63e2b8dd4..5a170cbbbed8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -632,6 +632,13 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
iommu->last_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+ if (is_rd890_iommu(iommu->dev)) {
+ pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
+ pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
+ pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
+ pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
+ }
}
/*
@@ -649,29 +656,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
struct ivhd_entry *e;
/*
- * First set the recommended feature enable bits from ACPI
- * into the IOMMU control registers
+ * First save the recommended feature enable bits from ACPI
*/
- h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- h->flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+ iommu->acpi_flags = h->flags;
/*
* Done. Now parse the device entries
@@ -1116,6 +1103,40 @@ static void init_device_table(void)
}
}
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+ iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_quirks(struct amd_iommu *iommu)
+{
+ if (is_rd890_iommu(iommu->dev)) {
+ pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
+ pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
+ pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
+ pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
+ }
+}
+
/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
@@ -1126,6 +1147,8 @@ static void enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable(iommu);
+ iommu_apply_quirks(iommu);
+ iommu_init_flags(iommu);
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
iommu_enable_event_buffer(iommu);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 980508c79082..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
* acpi lapic path already maps that address in
* acpi_register_lapic_address()
*/
- if (!acpi_lapic)
+ if (!acpi_lapic && !smp_found_config)
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8884928d7bc1..f1e78940d908 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
old_cfg = old_desc->chip_data;
- memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+ cfg->vector = old_cfg->vector;
+ cfg->move_in_progress = old_cfg->move_in_progress;
+ cpumask_copy(cfg->domain, old_cfg->domain);
+ cpumask_copy(cfg->old_domain, old_cfg->old_domain);
init_copy_irq_2_pin(old_cfg, cfg, node);
}
-static void free_irq_cfg(struct irq_cfg *old_cfg)
+static void free_irq_cfg(struct irq_cfg *cfg)
{
- kfree(old_cfg);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
}
void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -1728,6 +1733,8 @@ __apicdebuginit(void) print_IO_APIC(void)
struct irq_pin_list *entry;
cfg = desc->chip_data;
+ if (!cfg)
+ continue;
entry = cfg->irq_2_pin;
if (!entry)
continue;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..f744f54cb248 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
if (reason != DIE_NMI_IPI)
return NOTIFY_OK;
+
+ if (in_crash_kexec)
+ /* do nothing if entering the crash kernel */
+ return NOTIFY_OK;
/*
* Use a lock so only one cpu prints at a time
* to prevent intermixed output.
@@ -694,9 +698,11 @@ void __init uv_system_init(void)
for (j = 0; j < 64; j++) {
if (!test_bit(j, &present))
continue;
- uv_blade_info[blade].pnode = (i * 64 + j);
+ pnode = (i * 64 + j);
+ uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ max_pnode = max(pnode, max_pnode);
blade++;
}
}
@@ -734,7 +740,6 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
- max_pnode = max(pnode, max_pnode);
}
/* Add blade/pnode info for nodes without cpus */
@@ -746,7 +751,6 @@ void __init uv_system_init(void)
pnode = (paddr >> m_val) & pnode_mask;
blade = boot_pnode_to_blade(pnode);
uv_node_to_blade[nid] = blade;
- max_pnode = max(pnode, max_pnode);
}
map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3a7c852f021d..a8b4d91b8394 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -669,7 +669,7 @@ bool cpu_has_amd_erratum(const int *erratum)
}
/* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 8) | cpu->x86_mask;
+ ms = (cpu->x86_model << 4) | cpu->x86_mask;
while ((range = *erratum++))
if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
(ms >= AMD_MODEL_RANGE_START(range)) &&
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 787b3c7c6625..15c671385f59 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -545,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
u32 ebx;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 246cd3afbb5f..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -72,7 +72,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
static struct cpufreq_driver acpi_cpufreq_driver;
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 994230d4dc4e..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
return -ENODEV;
out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER)
- return -ENODEV;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors)
- return -ENODEV;
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1))
- return -ENODEV;
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
out_free:
kfree(output.pointer);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3a683ea5267e..695f17731e23 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
c->cpuid_level = cpuid_eax(0);
+ get_cpu_cap(c);
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be89..8209472b27a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
rcd.hdr.revision = CPER_RECORD_REV;
rcd.hdr.signature_end = CPER_SIG_END;
rcd.hdr.section_count = 1;
- rcd.hdr.error_severity = CPER_SER_FATAL;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
rcd.hdr.validation_bits = 0;
rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
rcd.sec_hdr.validation_bits = 0;
rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
- rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
memcpy(&rcd.mce, m, sizeof(*m));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..39aaee5c1ab2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
+
address += MCG_XBLK_ADDR;
} else
++address;
@@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (rdmsr_safe(address, &low, &high))
break;
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
+ if (!(high & MASK_VALID_HI))
+ continue;
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
@@ -530,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
- if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
kfree(b);
err = -ENOMEM;
goto out;
@@ -543,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, c->llc_shared_map);
+ cpumask_set_cpu(cpu, b->cpus);
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index c2a8b26d4fea..169d8804a9f8 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -202,10 +202,11 @@ static int therm_throt_process(bool new_event, int event, int level)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
+static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+ unsigned int cpu)
{
int err;
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
if (err)
@@ -215,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
err = sysfs_add_file_to_group(&sys_dev->kobj,
&attr_core_power_limit_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PTS))
+ if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&sys_dev->kobj,
&attr_package_throttle_count.attr,
thermal_attr_group.name);
@@ -223,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
err = sysfs_add_file_to_group(&sys_dev->kobj,
&attr_package_power_limit_count.attr,
thermal_attr_group.name);
+ }
return err;
}
@@ -251,7 +253,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev);
+ err = thermal_throttle_add_dev(sys_dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -287,7 +289,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+ err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f2da20fda02d..03a5b0385ad6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,6 +102,7 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events;
@@ -1010,6 +1011,7 @@ static int x86_pmu_start(struct perf_event *event)
x86_perf_event_set_period(event);
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
+ __set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
@@ -1141,8 +1143,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /*
+ * Though we deactivated the counter some cpus
+ * might still deliver spurious interrupts still
+ * in flight. Catch them:
+ */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
continue;
+ }
event = cpuc->events[idx];
hwc = &event->hw;
@@ -1154,7 +1164,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
/*
* event overflow
*/
- handled = 1;
+ handled++;
data.period = event->hw.last_period;
if (!x86_perf_event_set_period(event))
@@ -1200,12 +1210,20 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
+struct pmu_nmi_state {
+ unsigned int marked;
+ int handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
unsigned long cmd, void *__args)
{
struct die_args *args = __args;
- struct pt_regs *regs;
+ unsigned int this_nmi;
+ int handled;
if (!atomic_read(&active_events))
return NOTIFY_DONE;
@@ -1214,22 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self,
case DIE_NMI:
case DIE_NMI_IPI:
break;
-
+ case DIE_NMIUNKNOWN:
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+ /* let the kernel handle the unknown nmi */
+ return NOTIFY_DONE;
+ /*
+ * This one is a PMU back-to-back nmi. Two events
+ * trigger 'simultaneously' raising two back-to-back
+ * NMIs. If the first NMI handles both, the latter
+ * will be empty and daze the CPU. So, we drop it to
+ * avoid false-positive 'unknown nmi' messages.
+ */
+ return NOTIFY_STOP;
default:
return NOTIFY_DONE;
}
- regs = args->regs;
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
- /*
- * Can't rely on the handled return value to say it was our NMI, two
- * events could trigger 'simultaneously' raising two back-to-back NMIs.
- *
- * If the first NMI handles both, the latter will be empty and daze
- * the CPU.
- */
- x86_pmu.handle_irq(regs);
+
+ handled = x86_pmu.handle_irq(args->regs);
+ if (!handled)
+ return NOTIFY_DONE;
+
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if ((handled > 1) ||
+ /* the next nmi could be a back-to-back nmi */
+ ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+ (__get_cpu_var(pmu_nmi).handled > 1))) {
+ /*
+ * We could have two subsequent back-to-back nmis: The
+ * first handles more than one counter, the 2nd
+ * handles only one counter and the 3rd handles no
+ * counter.
+ *
+ * This is the 2nd nmi because the previous was
+ * handling more than one counter. We will mark the
+ * next (3rd) and then drop it if unhandled.
+ */
+ __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
+ __get_cpu_var(pmu_nmi).handled = handled;
+ }
return NOTIFY_STOP;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 214ac860ebe0..ee05c90012d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added)
* Intel Errata AAP53 (model 30)
* Intel Errata BD53 (model 44)
*
- * These chips need to be 'reset' when adding counters by programming
- * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
- * either in sequence on the same PMC or on different PMCs.
+ * The official story:
+ * These chips need to be 'reset' when adding counters by programming the
+ * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ * in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
*/
-static void intel_pmu_nhm_enable_all(int added)
+static void intel_pmu_nhm_workaround(void)
{
- if (added) {
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ static const unsigned long nhm_magic[4] = {
+ 0x4300B5,
+ 0x4300D2,
+ 0x4300B1,
+ 0x4300B1
+ };
+ struct perf_event *event;
+ int i;
+
+ /*
+ * The Errata requires below steps:
+ * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+ * 2) Configure 4 PERFEVTSELx with the magic events and clear
+ * the corresponding PMCx;
+ * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+ * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+ * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+ */
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+ /*
+ * The real steps we choose are a little different from above.
+ * A) To reduce MSR operations, we don't run step 1) as they
+ * are already cleared before this function is called;
+ * B) Call x86_perf_event_update to save PMCx before configuring
+ * PERFEVTSELx with magic number;
+ * C) With step 5), we do clear only when the PERFEVTSELx is
+ * not used currently.
+ * D) Call x86_perf_event_set_period to restore PMCx;
+ */
+
+ /* We always operate 4 pairs of PERF Counters */
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event)
+ x86_perf_event_update(event);
+ }
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+ for (i = 0; i < 4; i++) {
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+ wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+ }
- for (i = 0; i < 3; i++) {
- struct perf_event *event = cpuc->events[i];
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
- if (!event)
- continue;
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event) {
+ x86_perf_event_set_period(event);
__x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
- }
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ } else
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
}
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added)
+ intel_pmu_nhm_workaround();
intel_pmu_enable_all(added);
}
@@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
int bit, loops;
- u64 ack, status;
+ u64 status;
+ int handled = 0;
perf_sample_data_init(&data, 0);
@@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_ack_status(status);
if (++loops > 100) {
WARN_ONCE(1, "perfevents: irq loop stuck!\n");
perf_event_print_debug();
@@ -691,19 +738,22 @@ again:
}
inc_irq_stat(apic_perf_irqs);
- ack = status;
intel_pmu_lbr_read();
/*
* PEBS overflow sets bit 62 in the global status register
*/
- if (__test_and_clear_bit(62, (unsigned long *)&status))
+ if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+ handled++;
x86_pmu.drain_pebs(regs);
+ }
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
+ handled++;
+
if (!test_bit(bit, cpuc->active_mask))
continue;
@@ -716,8 +766,6 @@ again:
x86_pmu_stop(event);
}
- intel_pmu_ack_status(ack);
-
/*
* Repeat if there is more work to be done:
*/
@@ -727,7 +775,7 @@ again:
done:
intel_pmu_enable_all(0);
- return 1;
+ return handled;
}
static struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 107711bf0ee8..249015173992 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -497,6 +497,8 @@ static int p4_hw_config(struct perf_event *event)
event->hw.config |= event->attr.config &
(p4_config_pack_escr(P4_ESCR_MASK_HT) |
p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
+
+ event->hw.config &= ~P4_CCCR_FORCE_OVF;
}
rc = x86_setup_perfctr(event);
@@ -656,9 +658,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ int overflow;
- if (!test_bit(idx, cpuc->active_mask))
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /* catch in-flight IRQs */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
continue;
+ }
event = cpuc->events[idx];
hwc = &event->hw;
@@ -666,12 +673,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
WARN_ON_ONCE(hwc->idx != idx);
/* it might be unflagged overflow */
- handled = p4_pmu_clear_cccr_ovf(hwc);
+ overflow = p4_pmu_clear_cccr_ovf(hwc);
val = x86_perf_event_update(event);
- if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
continue;
+ handled += overflow;
+
/* event overflow for sure */
data.period = event->hw.last_period;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 34b4dad6f0b8..d49079515122 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
+ { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+int in_crash_kexec;
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
static void kdump_nmi_shootdown_cpus(void)
{
+ in_crash_kexec = 1;
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e4bd78c160d8..76b8cd953dee 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,6 @@
#include <asm/apic.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/hpet.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -190,21 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
-/*
- * Force the read back of the CMP register in hpet_next_event()
- * to work around the problem that the CMP register write seems to be
- * delayed. See hpet_next_event() for details.
- *
- * We do this on all SMBUS incarnations for now until we have more
- * information about the affected chipsets.
- */
-static void __init ati_hpet_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_HPET_TIMER
- hpet_readback_cmp = 1;
-#endif
-}
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -234,8 +218,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
{}
};
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c5ea5cdbe7b3..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
*
* C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
+ * extern long execve(const char *name, char **argv, char **envp)
*
* asm input arguments:
* rdi: name, rsi: argv, rdx: envp
*
* We want to fallback into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
*
* do_sys_execve asm fallback arguments:
* rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ff4c453e13f3..fa8c1b8e09fb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -334,7 +334,7 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl $pa(swapper_pg_dir),%eax
+ movl pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $X86_CR0_PG,%eax
@@ -614,6 +614,8 @@ ignore_int:
.align 4
ENTRY(initial_code)
.long i386_start_kernel
+ENTRY(initial_page_table)
+ .long pa(swapper_pg_dir)
/*
* BSS section
@@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir)
#endif
swapper_pg_fixmap:
.fill 1024,4,0
+#ifdef CONFIG_X86_TRAMPOLINE
+ENTRY(trampoline_pg_dir)
+ .fill 1024,4,0
+#endif
ENTRY(empty_zero_page)
.fill 4096,1,0
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 351f9c0fea1f..7494999141b3 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -35,7 +35,6 @@
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
u8 hpet_msi_disable;
-u8 hpet_readback_cmp;
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
@@ -395,23 +394,27 @@ static int hpet_next_event(unsigned long delta,
* at that point and we would wait for the next hpet interrupt
* forever. We found out that reading the CMP register back
* forces the transfer so we can rely on the comparison with
- * the counter register below.
+ * the counter register below. If the read back from the
+ * compare register does not match the value we programmed
+ * then we might have a real hardware problem. We can not do
+ * much about it here, but at least alert the user/admin with
+ * a prominent warning.
*
- * That works fine on those ATI chipsets, but on newer Intel
- * chipsets (ICH9...) this triggers due to an erratum: Reading
- * the comparator immediately following a write is returning
- * the old value.
+ * An erratum on some chipsets (ICH9,..), results in
+ * comparator read immediately following a write returning old
+ * value. Workaround for this is to read this value second
+ * time, when first read returns old value.
*
- * We restrict the read back to the affected ATI chipsets (set
- * by quirks) and also run it with hpet=verbose for debugging
- * purposes.
+ * In fact the write to the comparator register is delayed up
+ * to two HPET cycles so the workaround we tried to restrict
+ * the readback to those known to be borked ATI chipsets
+ * failed miserably. So we give up on optimizations forever
+ * and penalize all HPET incarnations unconditionally.
*/
- if (hpet_readback_cmp || hpet_verbose) {
- u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
-
- if (cmp != cnt)
+ if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
+ if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
printk_once(KERN_WARNING
- "hpet: compare register read back failed.\n");
+ "hpet: compare register read back failed.\n");
}
return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
@@ -503,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
{
unsigned int irq;
- irq = create_irq();
+ irq = create_irq_nr(0, -1);
if (!irq)
return -EINVAL;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a474ec37c32f..ff15c9dcc25d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -206,11 +206,27 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
- /* Len */
- switch (x86_len) {
- case X86_BREAKPOINT_LEN_X:
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
+ *gen_type = HW_BREAKPOINT_X;
*gen_len = sizeof(long);
+ return 0;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ switch (x86_len) {
case X86_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
@@ -229,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
return -EINVAL;
}
- /* Type */
- switch (x86_type) {
- case X86_BREAKPOINT_EXECUTE:
- *gen_type = HW_BREAKPOINT_X;
- break;
- case X86_BREAKPOINT_WRITE:
- *gen_type = HW_BREAKPOINT_W;
- break;
- case X86_BREAKPOINT_RW:
- *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
@@ -316,9 +317,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
ret = -EINVAL;
switch (info->len) {
- case X86_BREAKPOINT_LEN_X:
- align = sizeof(long) -1;
- break;
case X86_BREAKPOINT_LEN_1:
align = 0;
break;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f11f5ce668f..a46cb3522c0c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -40,6 +40,7 @@
static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ef10940e1af0..852b81967a37 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -194,7 +194,7 @@ static struct hw_breakpoint {
unsigned long addr;
int len;
int type;
- struct perf_event **pev;
+ struct perf_event * __percpu *pev;
} breakinfo[HBP_NUM];
static unsigned long early_dr7;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1bfb6cf4dd55..770ebfb349e9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -709,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
struct hlist_node *node, *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -740,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
/* another task is sharing our hash bucket */
continue;
+ orig_ret_address = (unsigned long)ri->ret_addr;
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__get_cpu_var(current_kprobe) = &ri->rp->kp;
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__get_cpu_var(current_kprobe) = NULL;
}
- orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
@@ -759,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
break;
}
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
kretprobe_hash_unlock(current, &flags);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e0bc186d7501..1c355c550960 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,11 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr,
apply_paravirt(pseg, pseg + para->sh_size);
}
- return module_bug_finalize(hdr, sechdrs, me);
+ return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..d7b6f7fb4fec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+static void __init smp_register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (early)
return 1;
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ smp_register_lapic_address(mpc->lapic);
+
if (mpc->oemptr)
x86_init.mpparse.smp_read_mpc_oem(mpc);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..c5b250011fd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
.alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d401f1d2d06e..57d1868a86aa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread);
/*
* sys_execve() executes a new program.
*/
-long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs *regs)
+long sys_execve(const char __user *name,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp, struct pt_regs *regs)
{
long error;
char *filename;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index dede5c4bae4b..322b24fbeafd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -83,7 +83,6 @@
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
-#include <asm/vmi.h>
#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
@@ -733,10 +732,10 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
-
- /* OFW also may relocate the fixmap */
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
olpc_ofw_detect();
early_trap_init();
@@ -837,9 +836,6 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
-
/* after early param, so could get panic from serial */
reserve_early_setup_data();
@@ -1013,6 +1009,8 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ setup_trampoline_page_table();
+
tboot_probe();
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a5e928b0cb5f..63a1a5596ac0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,6 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/vmi.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
@@ -73,7 +72,6 @@
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
#endif
/* State of each CPU */
@@ -91,6 +89,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
+
+/*
+ * We need this for trampoline_base protection from concurrent accesses when
+ * off- and onlining cores wildly.
+ */
+static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
+
+void cpu_hotplug_driver_lock()
+{
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
+}
+
+void cpu_hotplug_driver_unlock()
+{
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+}
+
+ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
+ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
#else
static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
@@ -281,7 +298,18 @@ notrace static void __cpuinit start_secondary(void *unused)
* fragile that we want to limit the things done here to the
* most necessary things.
*/
- vmi_bringup();
+
+#ifdef CONFIG_X86_32
+ /*
+ * Switch away from the trampoline page-table
+ *
+ * Do this before cpu_init() because it needs to access per-cpu
+ * data which may not be mapped in the trampoline page-table.
+ */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+#endif
+
cpu_init();
preempt_disable();
smp_callin();
@@ -299,12 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused)
legacy_pic->chip->unmask(0);
}
-#ifdef CONFIG_X86_32
- while (low_mappings)
- cpu_relax();
- __flush_tlb_all();
-#endif
-
/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
@@ -750,6 +772,7 @@ do_rest:
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
+ initial_page_table = __pa(&trampoline_pg_dir);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
@@ -897,20 +920,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-#ifdef CONFIG_X86_32
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- flush_tlb_all();
- low_mappings = 1;
-
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings(false);
- low_mappings = 0;
-#else
- err = do_boot_cpu(apicid, cpu);
-#endif
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 196552bb412c..d5e06624e34a 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -28,7 +28,9 @@
* Do a system call from kernel instead of calling sys_execve so we
* end up with proper pt_regs.
*/
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+ const char *const argv[],
+ const char *const envp[])
{
long __res;
asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e2..312ef0292815 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
*/
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -22,19 +23,37 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
-struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
+/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
+static int timeout_base_ns[] = {
+ 20,
+ 160,
+ 1280,
+ 10240,
+ 81920,
+ 655360,
+ 5242880,
+ 167772160
};
-
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
-
+static int timeout_us;
static int nobau;
+static int baudisabled;
+static spinlock_t disable_lock;
+static cycles_t congested_cycles;
+
+/* tunables: */
+static int max_bau_concurrent = MAX_BAU_CONCURRENT;
+static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_response_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
+
static int __init setup_nobau(char *arg)
{
nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-struct reset_args {
- int sender;
-};
-
/*
* Determine the first node on a uvhub. 'Nodes' are used for kernel
* memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
struct ptc_stats *stat;
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_retries++;
/*
* cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
slot2 = msg2 - mdp->va_queue_first;
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg2->sw_ack_vector << 8) |
- msg2->sw_ack_vector);
+ msg_res = msg2->sw_ack_vector;
/*
* This is a message retry; clear the resources held
* by the previous message only if they timed out.
* If it has not timed out we have an unexpected
* situation to report.
*/
- if (mmr & (msg_res << 8)) {
+ if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
/*
* is the resource timed out?
* make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
cancel_count++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << 8) | msg_res);
- } else
- printk(KERN_INFO "note bau retry: no effect\n");
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
+ }
}
}
if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
* This must be a normal message, or retry of a normal message
*/
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
bcp = &per_cpu(bau_control, smp_processor_id());
rap = (struct reset_args *)ptr;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_resets++;
/*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
*/
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg->sw_ack_vector << 8) |
- msg->sw_ack_vector);
+ msg_res = msg->sw_ack_vector;
if (mmr & msg_res) {
stat->d_rcanceled++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- msg_res);
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
}
}
}
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift, int this_cpu,
struct bau_control *bcp, struct bau_control *smaster, long try)
{
- int relaxes = 0;
unsigned long descriptor_status;
- unsigned long mmr;
- unsigned long mask;
cycles_t ttime;
- cycles_t timeout_time;
- struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster;
hmaster = bcp->uvhub_master;
- timeout_time = get_cycles() + bcp->timeout_interval;
/* spin on the status MMR, waiting for it to go idle */
while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* pending. In that case hardware returns the
* ERROR that looks like a destination timeout.
*/
- if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+ if (cycles_2_us(ttime - bcp->send_message) <
+ timeout_us) {
bcp->conseccompletes = 0;
return FLUSH_RETRY_PLUGGED;
}
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* descriptor_status is still BUSY
*/
cpu_relax();
- relaxes++;
- if (relaxes >= 10000) {
- relaxes = 0;
- if (get_cycles() > timeout_time) {
- quiesce_local_uvhub(hmaster);
-
- /* single-thread the register change */
- spin_lock(&hmaster->masks_lock);
- mmr = uv_read_local_mmr(mmr_offset);
- mask = 0UL;
- mask |= (3UL < right_shift);
- mask = ~mask;
- mmr &= mask;
- uv_write_local_mmr(mmr_offset, mmr);
- spin_unlock(&hmaster->masks_lock);
- end_uvhub_quiesce(hmaster);
- stat->s_busy++;
- return FLUSH_GIVEUP;
- }
- }
}
}
bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
return 1;
}
+/*
+ * Our retries are blocked by all destination swack resources being
+ * in use, and a timeout is pending. In that case hardware immediately
+ * returns the ERROR that looks like a destination timeout.
+ */
+static void
+destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ udelay(bcp->plugged_delay);
+ bcp->plugged_tries++;
+ if (bcp->plugged_tries >= bcp->plugsb4reset) {
+ bcp->plugged_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+}
+
+static void
+destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ hmaster->max_bau_concurrent = 1;
+ bcp->timeout_tries++;
+ if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
+ bcp->timeout_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+}
+
+/*
+ * Completions are taking a very long time due to a congested numalink
+ * network.
+ */
+static void
+disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+
+ /* let only one cpu do this disabling */
+ spin_lock(&disable_lock);
+ if (!baudisabled && bcp->period_requests &&
+ ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+ /* it becomes this cpu's job to turn on the use of the
+ BAU again */
+ baudisabled = 1;
+ bcp->set_bau_off = 1;
+ bcp->set_bau_on_time = get_cycles() +
+ sec_2_cycles(bcp->congested_period);
+ stat->s_bau_disabled++;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 1;
+ }
+ }
+ spin_unlock(&disable_lock);
+}
+
/**
* uv_flush_send_and_wait
*
* Send a broadcast and wait for it to complete.
*
- * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * The flush_mask contains the cpus the broadcast is to be sent to including
* cpus that are on the local uvhub.
*
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
*/
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask,
- struct bau_control *bcp)
+int uv_flush_send_and_wait(struct bau_desc *bau_desc,
+ struct cpumask *flush_mask, struct bau_control *bcp)
{
int right_shift;
- int uvhub;
- int bit;
int completion_status = 0;
int seq_number = 0;
long try = 0;
int cpu = bcp->uvhub_cpu;
int this_cpu = bcp->cpu;
- int this_uvhub = bcp->uvhub;
unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
- struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+ cycles_t elapsed;
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *smaster = bcp->socket_master;
struct bau_control *hmaster = bcp->uvhub_master;
- /*
- * Spin here while there are hmaster->max_concurrent or more active
- * descriptors. This is the per-uvhub 'throttle'.
- */
if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent)) {
+ hmaster->max_bau_concurrent)) {
stat->s_throttles++;
do {
cpu_relax();
} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent));
+ hmaster->max_bau_concurrent));
}
-
while (hmaster->uvhub_quiesce)
cpu_relax();
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
}
time1 = get_cycles();
do {
- /*
- * Every message from any given cpu gets a unique message
- * sequence number. But retries use that same number.
- * Our message may have timed out at the destination because
- * all sw-ack resources are in use and there is a timeout
- * pending there. In that case, our last send never got
- * placed into the queue and we need to persist until it
- * does.
- *
- * Make any retry a type MSG_RETRY so that the destination will
- * free any resource held by a previous message from this cpu.
- */
if (try == 0) {
- /* use message type set by the caller the first time */
+ bau_desc->header.msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
- /* use RETRY type on all the rest; same sequence */
bau_desc->header.msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
bcp->uvhub_cpu;
bcp->send_message = get_cycles();
-
uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
try++;
completion_status = uv_wait_completion(bau_desc, mmr_offset,
right_shift, this_cpu, bcp, smaster, try);
if (completion_status == FLUSH_RETRY_PLUGGED) {
- /*
- * Our retries may be blocked by all destination swack
- * resources being consumed, and a timeout pending. In
- * that case hardware immediately returns the ERROR
- * that looks like a destination timeout.
- */
- udelay(TIMEOUT_DELAY);
- bcp->plugged_tries++;
- if (bcp->plugged_tries >= PLUGSB4RESET) {
- bcp->plugged_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_plug++;
- }
+ destination_plugged(bau_desc, bcp, hmaster, stat);
} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- hmaster->max_concurrent = 1;
- bcp->timeout_tries++;
- udelay(TIMEOUT_DELAY);
- if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
- bcp->timeout_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_timeout++;
- }
+ destination_timeout(bau_desc, bcp, hmaster, stat);
}
- if (bcp->ipi_attempts >= 3) {
+ if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
bcp->ipi_attempts = 0;
completion_status = FLUSH_GIVEUP;
break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
(completion_status == FLUSH_RETRY_TIMEOUT));
time2 = get_cycles();
-
- if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
- && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
- hmaster->max_concurrent++;
-
- /*
- * hold any cpu not timing out here; no other cpu currently held by
- * the 'throttle' should enter the activation code
- */
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if ((completion_status == FLUSH_COMPLETE) &&
+ (bcp->conseccompletes > bcp->complete_threshold) &&
+ (hmaster->max_bau_concurrent <
+ hmaster->max_bau_concurrent_constant))
+ hmaster->max_bau_concurrent++;
while (hmaster->uvhub_quiesce)
cpu_relax();
atomic_dec(&hmaster->active_descriptor_count);
-
- /* guard against cycles wrap */
- if (time2 > time1)
- stat->s_time += (time2 - time1);
- else
- stat->s_requestor--; /* don't count this one */
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > congested_cycles) &&
+ (bcp->period_requests > bcp->congested_reps)) {
+ disable_for_congestion(bcp, stat);
+ }
+ }
+ } else
+ stat->s_requestor--;
if (completion_status == FLUSH_COMPLETE && try > 1)
stat->s_retriesok++;
else if (completion_status == FLUSH_GIVEUP) {
- /*
- * Cause the caller to do an IPI-style TLB shootdown on
- * the target cpu's, all of which are still in the mask.
- */
stat->s_giveup++;
- return flush_mask;
- }
-
- /*
- * Success, so clear the remote cpu's from the mask so we don't
- * use the IPI method of shootdown on them.
- */
- for_each_cpu(bit, flush_mask) {
- uvhub = uv_cpu_to_blade_id(bit);
- if (uvhub == this_uvhub)
- continue;
- cpumask_clear_cpu(bit, flush_mask);
+ return 1;
}
- if (!cpumask_empty(flush_mask))
- return flush_mask;
-
- return NULL;
+ return 0;
}
/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- int remotes;
int tcpu;
int uvhub;
int locals = 0;
+ int remotes = 0;
+ int hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
+ struct bau_control *tbcp;
+ /* kernel was booted 'nobau' */
if (nobau)
return cpumask;
bcp = &per_cpu(bau_control, cpu);
+ stat = bcp->statp;
+
+ /* bau was disabled due to slow response */
+ if (bcp->baudisabled) {
+ /* the cpu that disabled it must re-enable it */
+ if (bcp->set_bau_off) {
+ if (get_cycles() >= bcp->set_bau_on_time) {
+ stat->s_bau_reenabled++;
+ baudisabled = 0;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
+ }
+ }
+ }
+ return cpumask;
+ }
+
/*
* Each sending cpu has a per-cpu mask which it fills from the caller's
- * cpu mask. Only remote cpus are converted to uvhubs and copied.
+ * cpu mask. All cpus are converted to uvhubs and copied to the
+ * activation descriptor.
*/
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
- /*
- * copy cpumask to flush_mask, removing current cpu
- * (current cpu should already have been flushed by the caller and
- * should never be returned if we return flush_mask)
- */
+ /* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
if (cpu_isset(cpu, *cpumask))
- locals++; /* current cpu was targeted */
+ stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
- remotes = 0;
+
+ /* cpu statistics */
for_each_cpu(tcpu, flush_mask) {
uvhub = uv_cpu_to_blade_id(tcpu);
- if (uvhub == bcp->uvhub) {
- locals++;
- continue;
- }
bau_uvhub_set(uvhub, &bau_desc->distribution);
- remotes++;
- }
- if (remotes == 0) {
- /*
- * No off_hub flushing; return status for local hub.
- * Return the caller's mask if all were local (the current
- * cpu may be in that mask).
- */
- if (locals)
- return cpumask;
+ if (uvhub == bcp->uvhub)
+ locals++;
else
- return NULL;
+ remotes++;
}
- stat = &per_cpu(ptcstats, cpu);
+ if ((locals + remotes) == 0)
+ return NULL;
stat->s_requestor++;
- stat->s_ntargcpu += remotes;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
remotes = bau_uvhub_weight(&bau_desc->distribution);
- stat->s_ntarguvhub += remotes;
- if (remotes >= 16)
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
+ } else
+ stat->s_ntargremoteuvhub += hubs;
+ stat->s_ntarguvhub += hubs;
+ if (hubs >= 16)
stat->s_ntarguvhub16++;
- else if (remotes >= 8)
+ else if (hubs >= 8)
stat->s_ntarguvhub8++;
- else if (remotes >= 4)
+ else if (hubs >= 4)
stat->s_ntarguvhub4++;
- else if (remotes >= 2)
+ else if (hubs >= 2)
stat->s_ntarguvhub2++;
else
stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
bau_desc->payload.sending_cpu = cpu;
/*
- * uv_flush_send_and_wait returns null if all cpu's were messaged, or
- * the adjusted flush_mask if any cpu's were not messaged.
+ * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+ * or 1 if it gave up and the original cpumask should be returned.
*/
- return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
+ if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+ return NULL;
+ else
+ return cpumask;
}
/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());
- stat = &per_cpu(ptcstats, smp_processor_id());
+ stat = bcp->statp;
msgdesc.va_queue_first = bcp->va_queue_first;
msgdesc.va_queue_last = bcp->va_queue_last;
msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
}
static inline unsigned long long
-millisec_2_cycles(unsigned long millisec)
+microsec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
- ns = millisec * 1000;
+ ns = microsec * 1000;
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
return cyc;
}
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
if (!cpu) {
seq_printf(file,
- "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
+ "# cpu sent stime self locals remotes ncpus localhub ");
+ seq_printf(file,
+ "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
seq_printf(file,
"retries rok resetp resett giveup sto bz throt ");
seq_printf(file,
"sw_ack recv rtime all ");
seq_printf(file,
- "one mult none retry canc nocan reset rcan\n");
+ "one mult none retry canc nocan reset rcan ");
+ seq_printf(file,
+ "disable enable\n");
}
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file,
"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
cpu, stat->s_requestor, cycles_2_us(stat->s_time),
- stat->s_ntarguvhub, stat->s_ntarguvhub16,
+ stat->s_ntargself, stat->s_ntarglocals,
+ stat->s_ntargremotes, stat->s_ntargcpu,
+ stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+ stat->s_ntarguvhub, stat->s_ntarguvhub16);
+ seq_printf(file, "%ld %ld %ld %ld %ld ",
stat->s_ntarguvhub8, stat->s_ntarguvhub4,
stat->s_ntarguvhub2, stat->s_ntarguvhub1,
- stat->s_ntargcpu, stat->s_dtimeout);
+ stat->s_dtimeout);
seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
stat->s_retry_messages, stat->s_retriesok,
stat->s_resets_plug, stat->s_resets_timeout,
stat->s_giveup, stat->s_stimeout,
stat->s_busy, stat->s_throttles);
+
/* destination side statistics */
seq_printf(file,
- "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+ "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
stat->d_nomsg, stat->d_retries, stat->d_canceled,
stat->d_nocanceled, stat->d_resets,
stat->d_rcanceled);
+ seq_printf(file, "%ld %ld\n",
+ stat->s_bau_disabled, stat->s_bau_reenabled);
}
return 0;
}
/*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[300];
+ int ret;
+
+ ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
+ "max_bau_concurrent plugged_delay plugsb4reset",
+ "timeoutsb4reset ipi_reset_limit complete_threshold",
+ "congested_response_us congested_reps congested_period",
+ max_bau_concurrent, plugged_delay, plugsb4reset,
+ timeoutsb4reset, ipi_reset_limit, complete_threshold,
+ congested_response_us, congested_reps, congested_period);
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+}
+
+/*
* -1: resetf the statistics
* 0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
*/
static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
size_t count, loff_t *data)
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
long input_arg;
char optstr[64];
struct ptc_stats *stat;
- struct bau_control *bcp;
if (count == 0 || count > sizeof(optstr))
return -EINVAL;
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
"reset: number of ipi-style reset requests processed\n");
printk(KERN_DEBUG
"rcan: number messages canceled by reset requests\n");
+ printk(KERN_DEBUG
+ "disable: number times use of the BAU was disabled\n");
+ printk(KERN_DEBUG
+ "enable: number times use of the BAU was re-enabled\n");
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
memset(stat, 0, sizeof(struct ptc_stats));
}
- } else {
- uv_bau_max_concurrent = input_arg;
- bcp = &per_cpu(bau_control, smp_processor_id());
- if (uv_bau_max_concurrent < 1 ||
- uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
- "Error: BAU max concurrent %d; %d is invalid\n",
- bcp->max_concurrent, uv_bau_max_concurrent);
- return -EINVAL;
- }
- printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
- uv_bau_max_concurrent);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurrent = uv_bau_max_concurrent;
+ }
+
+ return count;
+}
+
+static int local_atoi(const char *name)
+{
+ int val = 0;
+
+ for (;; name++) {
+ switch (*name) {
+ case '0' ... '9':
+ val = 10*val+(*name-'0');
+ break;
+ default:
+ return val;
}
}
+}
+
+/*
+ * set the tunables
+ * 0 values reset them to defaults
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int cnt = 0;
+ int val;
+ char *p;
+ char *q;
+ char instr[64];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+ instr[count] = '\0';
+ /* count the fields */
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (; *p; p = q + strspn(q, WHITESPACE)) {
+ q = p + strcspn(p, WHITESPACE);
+ cnt++;
+ if (q == p)
+ break;
+ }
+ if (cnt != 9) {
+ printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+ return -EINVAL;
+ }
+
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+ q = p + strcspn(p, WHITESPACE);
+ val = local_atoi(p);
+ switch (cnt) {
+ case 0:
+ if (val == 0) {
+ max_bau_concurrent = MAX_BAU_CONCURRENT;
+ max_bau_concurrent_constant =
+ MAX_BAU_CONCURRENT;
+ continue;
+ }
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (val < 1 || val > bcp->cpus_in_uvhub) {
+ printk(KERN_DEBUG
+ "Error: BAU max concurrent %d is invalid\n",
+ val);
+ return -EINVAL;
+ }
+ max_bau_concurrent = val;
+ max_bau_concurrent_constant = val;
+ continue;
+ case 1:
+ if (val == 0)
+ plugged_delay = PLUGGED_DELAY;
+ else
+ plugged_delay = val;
+ continue;
+ case 2:
+ if (val == 0)
+ plugsb4reset = PLUGSB4RESET;
+ else
+ plugsb4reset = val;
+ continue;
+ case 3:
+ if (val == 0)
+ timeoutsb4reset = TIMEOUTSB4RESET;
+ else
+ timeoutsb4reset = val;
+ continue;
+ case 4:
+ if (val == 0)
+ ipi_reset_limit = IPI_RESET_LIMIT;
+ else
+ ipi_reset_limit = val;
+ continue;
+ case 5:
+ if (val == 0)
+ complete_threshold = COMPLETE_THRESHOLD;
+ else
+ complete_threshold = val;
+ continue;
+ case 6:
+ if (val == 0)
+ congested_response_us = CONGESTED_RESPONSE_US;
+ else
+ congested_response_us = val;
+ continue;
+ case 7:
+ if (val == 0)
+ congested_reps = CONGESTED_REPS;
+ else
+ congested_reps = val;
+ continue;
+ case 8:
+ if (val == 0)
+ congested_period = CONGESTED_PERIOD;
+ else
+ congested_period = val;
+ continue;
+ }
+ if (q == p)
+ break;
+ }
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
return count;
}
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
return seq_open(file, &uv_ptc_seq_ops);
}
+static int tunables_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
static const struct file_operations proc_uv_ptc_operations = {
.open = uv_ptc_proc_open,
.read = seq_read,
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = {
.release = seq_release,
};
+static const struct file_operations tunables_fops = {
+ .open = tunables_open,
+ .read = tunables_read,
+ .write = tunables_write,
+};
+
static int __init uv_ptc_init(void)
{
struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void)
UV_PTC_BASENAME);
return -EINVAL;
}
+
+ tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+ if (!tunables_dir) {
+ printk(KERN_ERR "unable to create debugfs directory %s\n",
+ UV_BAU_TUNABLES_DIR);
+ return -EINVAL;
+ }
+ tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+ tunables_dir, NULL, &tunables_fops);
+ if (!tunables_file) {
+ printk(KERN_ERR "unable to create debugfs file %s\n",
+ UV_BAU_TUNABLES_FILE);
+ return -EINVAL;
+ }
return 0;
}
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
}
/*
+ * We will set BAU_MISC_CONTROL with a timeout period.
+ * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
+ * So the destination timeout period has be be calculated from them.
+ */
+static int
+calculate_destination_timeout(void)
+{
+ unsigned long mmr_image;
+ int mult1;
+ int mult2;
+ int index;
+ int base;
+ int ret;
+ unsigned long ts_ns;
+
+ mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ base = timeout_base_ns[index];
+ ts_ns = base * mult1 * mult2;
+ ret = ts_ns / 1000;
+ return ret;
+}
+
+/*
* initialize the bau_control structure for each cpu
*/
-static void uv_init_per_cpu(int nuvhubs)
+static void __init uv_init_per_cpu(int nuvhubs)
{
- int i, j, k;
+ int i;
int cpu;
int pnode;
int uvhub;
+ int have_hmaster;
short socket = 0;
+ unsigned short socket_mask;
+ unsigned char *uvhub_mask;
struct bau_control *bcp;
struct uvhub_desc *bdp;
struct socket_desc *sdp;
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs)
short cpu_number[16];
};
struct uvhub_desc {
- short num_sockets;
+ unsigned short socket_mask;
short num_cpus;
short uvhub;
short pnode;
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs)
};
struct uvhub_desc *uvhub_descs;
+ timeout_us = calculate_destination_timeout();
+
uvhub_descs = (struct uvhub_desc *)
kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
memset(bcp, 0, sizeof(struct bau_control));
- spin_lock_init(&bcp->masks_lock);
- bcp->max_concurrent = uv_bau_max_concurrent;
pnode = uv_cpu_hub_info(cpu)->pnode;
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
bdp->num_cpus++;
bdp->uvhub = uvhub;
bdp->pnode = pnode;
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = millisec_2_cycles(3);
- /* kludge: assume uv_hub.h is constant */
- socket = (cpu_physical_id(cpu)>>5)&1;
- if (socket >= bdp->num_sockets)
- bdp->num_sockets = socket+1;
+ /* kludge: 'assuming' one node per socket, and assuming that
+ disabling a socket just leaves a gap in node numbers */
+ socket = (cpu_to_node(cpu) & 1);
+ bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
}
- socket = 0;
- for_each_possible_blade(uvhub) {
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
+ continue;
+ have_hmaster = 0;
bdp = &uvhub_descs[uvhub];
- for (i = 0; i < bdp->num_sockets; i++) {
- sdp = &bdp->socket[i];
- for (j = 0; j < sdp->num_cpus; j++) {
- cpu = sdp->cpu_number[j];
+ socket_mask = bdp->socket_mask;
+ socket = 0;
+ while (socket_mask) {
+ if (!(socket_mask & 1))
+ goto nextsocket;
+ sdp = &bdp->socket[socket];
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
bcp = &per_cpu(bau_control, cpu);
bcp->cpu = cpu;
- if (j == 0) {
+ if (i == 0) {
smaster = bcp;
- if (i == 0)
+ if (!have_hmaster) {
+ have_hmaster++;
hmaster = bcp;
+ }
}
bcp->cpus_in_uvhub = bdp->num_cpus;
bcp->cpus_in_socket = sdp->num_cpus;
bcp->socket_master = smaster;
+ bcp->uvhub = bdp->uvhub;
bcp->uvhub_master = hmaster;
- for (k = 0; k < DEST_Q_SIZE; k++)
- bcp->socket_acknowledge_count[k] = 0;
- bcp->uvhub_cpu =
- uv_cpu_hub_info(cpu)->blade_processor_id;
+ bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
+ blade_processor_id;
}
+nextsocket:
socket++;
+ socket_mask = (socket_mask >> 1);
}
}
kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
}
/*
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void)
zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
GFP_KERNEL, cpu_to_node(cur_cpu));
- uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nuvhubs = uv_num_possible_blades();
+ spin_lock_init(&disable_lock);
+ congested_cycles = microsec_2_cycles(congested_response_us);
uv_init_per_cpu(nuvhubs);
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void)
alloc_intr_gate(vector, uv_bau_message_intr1);
for_each_possible_blade(uvhub) {
- pnode = uv_blade_to_pnode(uvhub);
- /* INIT the bau */
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
- mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+ if (uv_blade_nr_possible_cpus(uvhub)) {
+ pnode = uv_blade_to_pnode(uvhub);
+ /* INIT the bau */
+ uv_write_global_mmr64(pnode,
+ UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+ ((unsigned long)1 << 63));
+ mmr = 1; /* should be 1 to broadcast to both sockets */
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
+ mmr);
+ }
}
return 0;
}
core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
+fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..e2a595257390 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,6 +1,7 @@
#include <linux/io.h>
#include <asm/trampoline.h>
+#include <asm/pgtable.h>
#include <asm/e820.h>
#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
@@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void)
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
+
+void __init setup_trampoline_page_table(void)
+{
+#ifdef CONFIG_X86_32
+ /* Copy kernel address range */
+ clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ /* Initialize low mappings */
+ clone_pgd_range(trampoline_pg_dir,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS,
+ KERNEL_PGD_BOUNDARY));
+#endif
+}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ce8e50239332..26a863a9c2a8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
local_irq_restore(flags);
}
+static unsigned long long cyc2ns_suspend;
+
+void save_sched_clock_state(void)
+{
+ if (!sched_clock_stable)
+ return;
+
+ cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, TSC gets reset in some the
+ * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to
+ * arbitrary value (still sync'd across cpu's) during resume from such sleep
+ * states. To cope up with this, recompute the cyc2ns_offset for each cpu so
+ * that sched_clock() continues from the point where it was left off during
+ * suspend.
+ */
+void restore_sched_clock_state(void)
+{
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!sched_clock_stable)
+ return;
+
+ local_irq_save(flags);
+
+ __get_cpu_var(cyc2ns_offset) = 0;
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cyc2ns_offset, cpu) = offset;
+
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend, doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run... anyways, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub. Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- /*
- * Prevent page tables from being allocated in highmem, even if
- * CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi [deprecated]";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
- x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
- x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- x86_platform.calibrate_tsc = vmi_tsc_khz;
- x86_platform.get_wallclock = vmi_get_wallclock;
- x86_platform.set_wallclock = vmi_set_wallclock;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, basil */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
- return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, set_next_event interface only passes relative
- * expiry, but we want absolute expiry. It'd be better if were
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32-bits. Choosing
- * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * as clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);